Code example #1
def fun5():
    # Comparative wordlists
    # Another example of a tabular lexicon is the comparative wordlist. NLTK includes
    # the so-called Swadesh wordlists: lists of about 200 common words in several
    # languages. The language identifiers are ISO 639 two-letter codes.
    from nltk.corpus import swadesh
    print swadesh.fileids()
    print swadesh.words('en')
    # Cognate words across several languages can be accessed with the entries()
    # method by passing a list of languages. The result can also be converted
    # into a simple dictionary.
    fr2en = swadesh.entries(['fr', 'en'])
    print fr2en
    translate = dict(fr2en)
    print translate['chien']
    print translate['jeter']
Code example #2
def test_util():
    count = 0
    error = 0
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in Test_class:
        path = os.path.join(test, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                doc_words = []
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords and len(word) > 2
                        ]
                        for k in filtered_word:
                            doc_words.append(k)
                    if ("Lines:" in line):
                        flag = 1
                test_cls = testNB(doc_words)
                if (i == test_cls):
                    count += 1
                else:
                    error += 1
                    count += 1
    print "Accuracy :", float((float(count - error) / float(count)) * 100)
Code example #3
File: article.py  Project: kozikowskik/newspaper
    def _calculate_languages_ratios(self, text):
        """
        Calculate the likelihood that the given text is written in each of several
        languages, and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of unique stopwords seen in the analyzed text
        @rtype: dict
        """

        languages_ratios = {}

        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]

        # For each language in NLTK's Swadesh corpus, count the unique common words that appear in the analyzed text
        for language in swadesh.fileids():
            stopwords_set = set(swadesh.words(language))
            words_set = set(words)
            common_elements = words_set.intersection(stopwords_set)

            languages_ratios[language] = len(common_elements)  # language "score"

        return languages_ratios
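A minimal usage sketch (not part of the original article.py): once the ratios dictionary is returned, the language with the highest score can be taken as the detected language. The ratios value below is a hypothetical result, not real output.

ratios = {'fr': 2, 'es': 4, 'en': 12}   # hypothetical output of _calculate_languages_ratios
most_likely_language = max(ratios, key=ratios.get)
print(most_likely_language)  # -> 'en'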
Code example #4
def Vocabulary(n):
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in classlist:
        local_cnt = collections.Counter()
        count = 0
        path = os.path.join(root, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                n += 1
                count += 1
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords
                        ]
                        for k in filtered_word:
                            cnt[k] += 1
                            local_cnt[k] += 1
                    if ("Lines:" in line):
                        flag = 1
        Nc[i] = count
        List_cnt[i] = local_cnt
    return n
Code example #5
def dump_swadesh(path="sw.txt"):
    # Write every word in the (multilingual) Swadesh corpus to a file, one per line.
    # The original defined this as `def swadesh():`, which shadowed the imported
    # corpus and used undefined names (f, j, length, top, all); this version is
    # self-contained.
    from nltk.corpus import swadesh
    with open(path, "wb") as f:
        for word in swadesh.words():
            f.write((word + "\r\n").encode('utf-8'))
Code example #6
def compareWordlist():

    swadesh.fileids()
    swadesh.words('en')

    fr2en = swadesh.entries(['fr', 'en'])
    fr2en

    translate = dict(fr2en)
    translate['chien']
    translate['jeter']

    de2en = swadesh.entries(['de', 'en'])    # German-English
    es2en = swadesh.entries(['es', 'en'])    # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    translate['Hund']
    translate['perro']

    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    for i in [139, 140, 141, 142]:
        print swadesh.entries(languages)[i]
Code example #7
File: predict_length.py  Project: anna-hope/talaffoz
def test_swadesh(model, lang) -> Tuple[Optional[float], Optional[List]]:
    swadesh_langs = set(swadesh.fileids())
    if lang in swadesh_langs:
        logging.info('Testing model on Swadesh list for {}...'.format(lang))
        # some entries in the swadesh list have multiple words
        # because they include contextual definitions
        # so we need to only take the first word
        words = swadesh.words(fileids=lang)
        words = [word.split()[0].casefold() for word in words]
        accuracy, errors = test_accuracy(words, model)
    else:
        logging.error('No Swadesh corpus for "{}"'.format(lang))
        accuracy = None
        errors = None
    return accuracy, errors
Code example #8
File: testImplemented.py  Project: Wiki-G/wikiG-app
    def removeStopwords(self, inlist):
        stop_words1 = stopwords.words('english')
        stop_words2 = swadesh.words('en')
        finalstopwords = stop_words1 + stop_words2
        #here we can add any other words that we need to add to stopwords
        finalstopwords = finalstopwords + ['ref','also','title','http','image','cite','nbsp','disambiguation','article','articles','pages','page','wikipedia','retrieved','category','categories']
        templist = []
        for items in inlist:
            templist.append(items.lower())

        for words in finalstopwords:
            while words in templist:
                templist.remove(words)

        return templist
Code example #9
    def removeStopwords(self, inlist):
        stop_words1 = stopwords.words('english')
        stop_words2 = swadesh.words('en')
        finalstopwords = stop_words1 + stop_words2
        #here we can add any other words that we need to add to stopwords
        finalstopwords = finalstopwords + [
            'ref', 'also', 'title', 'http', 'image', 'cite', 'nbsp',
            'disambiguation', 'article', 'articles', 'pages', 'page',
            'wikipedia', 'retrieved', 'category', 'categories'
        ]
        templist = []
        for items in inlist:
            templist.append(items.lower())

        for words in finalstopwords:
            while words in templist:
                templist.remove(words)

        return templist
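A minimal alternative sketch (not from the original project): the nested `while ... remove()` loop above rescans templist for every stopword, which is quadratic. The same filtering can be done in a single pass with a set lookup.

from nltk.corpus import stopwords, swadesh

def remove_stopwords_fast(inlist):
    # Same filtering as removeStopwords above, but using a set for O(1) membership tests.
    stopset = set(stopwords.words('english')) | set(swadesh.words('en'))
    stopset |= {'ref', 'also', 'title', 'http', 'image', 'cite', 'nbsp',
                'disambiguation', 'article', 'articles', 'pages', 'page',
                'wikipedia', 'retrieved', 'category', 'categories'}
    return [item.lower() for item in inlist if item.lower() not in stopset]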
Code example #10
def preprocessGICorpus():
    giCorpus = {}
    corpus_root = os.getcwd() + "/GICorpus/"

    filelists = PlaintextCorpusReader(corpus_root, '.*\.txt', encoding='utf-8')
    wnl = nltk.WordNetLemmatizer()
    print filelists.fileids()
    for file in filelists.fileids():
        wordlist = filelists.words(file)
        print "Printing size of  " + file + " original wordlist: " + str(
            len(wordlist))
        trimmedWordlist = [
            x for x in wordlist
            if not (x in swadesh.words('en')) and len(x) >= 1
        ]
        # lemmatizedWordlist = [wnl.lemmatize(t) for t in trimmedWordlist]
        taggedWordlist = nltk.pos_tag(trimmedWordlist)
        print "Printing size of  " + file + " trimmed wordlist: " + str(
            len(trimmedWordlist))
        giCorpus[file] = taggedWordlist
        # fd = FreqDist(w for w in taggedWordlist)
    return giCorpus
Code example #11
def getSwadesh(bSave=False):  #W path="sw.txt"):
    j = 0
    f = open("sw.txt", "wb")
    length = [207, 207, 207, 207, 207, 174, 207, 207]
    top = 0
    all = 0
    j = 0
    for i in swadesh.words():
        s = str(j) + "-" + str(j % length[top])
        #s = str(j)
        #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
        if (bSave): f.write((i + "\r\n").encode('utf-8'))
        else: print(i)
        j += 1
        all = all + 1
        #if (all == length[top]): all = 0; top+=1

    if (bSave):
        e = swadesh.entries()
        for n in e:
            f.write((str(n) + "\r\n").encode('utf-8'))
    if (bSave): f.close()
Code example #12
def content_fraction2(text):
    compared_words = swadesh.words('en')
    content2 = [w for w in text if w.lower() not in compared_words]
    return content2
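Despite its name, content_fraction2 returns the list of remaining content words rather than a ratio. A minimal usage sketch (not from the original snippet), with a hypothetical token list, showing how the fraction itself could be computed:

text = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']  # hypothetical tokens
content = content_fraction2(text)
print(content)
print(len(content) / len(text))  # fraction of words not on the Swadesh list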
Code example #13
File: 06.py  Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words("en")
fr2en = swadesh.entries(["fr", "en"])
fr2en
translate = dict(fr2en)
translate["chien"]
translate["jeter"]
Code example #14
#!/usr/local/bin/python2.7.3
# -*- coding: utf-8 -*-
#given three files in English, Swedish and Greek,
#the script will produce 3 cleaned versions of them,
#3 tokenized versions of the cleaned versions,
#and 3 files containing the lines that were erased
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import swadesh
english_stopwords = stopwords.words('english')
swedish_stopwords = stopwords.words('swedish')

english_common_words = swadesh.words('en')

sv_eng_sim_in_sw_stopwords = ['i', 'till', 'dig', 'under']
for word in sv_eng_sim_in_sw_stopwords:
    if word in english_stopwords:
        english_stopwords.remove(word)

common_words_eng_swed = ['I', 'not', 'all', 'small', 'man', 'dog',\
 'bark', 'fat', 'hand', 'drink', 'live', 'hit', 'dig', 'lie', 'fall', \
 'river', 'lake', 'salt', 'sand', 'red', 'full', 'bad', 'far', 'in', 'and']

for word in common_words_eng_swed:
    if word in english_common_words:
        english_common_words.remove(word)


def read_files():
    path_el = 'corpus_el.txt'
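The snippet above is truncated at read_files(). A minimal sketch, assuming the standard NLTK tokenizer, of the kind of per-line cleaning step the header comments describe, reusing the english_stopwords and english_common_words lists built above; the original script's actual cleaning logic is not shown here.

from nltk.tokenize import word_tokenize

def clean_english_line(line):
    # Drop the adjusted English stopwords and Swadesh common words from one line.
    tokens = word_tokenize(line.lower())
    return [t for t in tokens
            if t not in english_stopwords and t not in english_common_words]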
Code example #15
File: cleaner.py  Project: limbero/project
#!/usr/local/bin/python2.7.3
# -*- coding: utf-8 -*-
#given three files in English, Swedish and Greek,
#the script will produce 3 cleaned versions of them,
#3 tokenized versions of the cleaned versions,
#and 3 files containing the lines that were erased
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import swadesh
english_stopwords = stopwords.words('english')
swedish_stopwords = stopwords.words('swedish')

english_common_words = swadesh.words('en')

sv_eng_sim_in_sw_stopwords = ['i', 'till', 'dig', 'under']
for word in sv_eng_sim_in_sw_stopwords:
	if word in english_stopwords:
		english_stopwords.remove(word)

common_words_eng_swed = ['I', 'not', 'all', 'small', 'man', 'dog',\
 'bark', 'fat', 'hand', 'drink', 'live', 'hit', 'dig', 'lie', 'fall', \
 'river', 'lake', 'salt', 'sand', 'red', 'full', 'bad', 'far', 'in', 'and']

for word in common_words_eng_swed:
	if word in english_common_words:
		english_common_words.remove(word)


def read_files():
	path_el = 'corpus_el.txt'
Code example #16
from nltk.corpus import swadesh
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

#import sys, locale, os
#print(sys.stdout.encoding)
#print(sys.stdout.isatty())
#print(locale.getpreferredencoding())
#print(sys.getfilesystemencoding())
#print(os.environ["PYTHONIOENCODING"])
#print(chr(246), chr(9786), chr(9787))

j = 0
f = open("sw.txt", "wb")

length = [207, 207, 207, 207, 207, 174, 207, 207]
top = 0
all = 0

for i in swadesh.words():
    s = str(j) + "-" + str(j % length[top])

    #s = str(j)
    #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
    f.write((i + "\r\n").encode('utf-8'))
    j += 1
    all = all + 1
    #if (all == length[top]): all = 0; top+=1
f.close()
#from nltk.corpus import swadesh
Code example #17
File: C0204.py  Project: zhuyuanxiang/NLTK-Python-CN
# Look up the pronunciations of words in the dictionary
text = ['natural', 'language', 'processing']
pron_list = [ph for w in text for ph in prondict[w][0]]
print("word pronoun list= ", pron_list)

# The [0] is needed because 'natural' has two pronunciations; taking one of them is enough
pron_list = [ph for w in text for ph in prondict[w]]
print("'natural' pronunciation list= ", pron_list)
print("prondict['natural']=", prondict['natural'])

# P70 2.4.3 Comparative wordlists (Swadesh wordlists)
# Lists of about 200 common words in several languages; useful for comparing the
# differences between two languages and for translating words between languages
from nltk.corpus import swadesh

print("swadesh.fileids()= ", swadesh.fileids())
print("swadesh.words('en')= ", swadesh.words('en'))

fr2en = swadesh.entries(['fr', 'en'])
print("fr2en= ", fr2en[:13])
translate = dict(fr2en)
print("translate= ", translate)
print("translate['chien']= ", translate['chien'])

de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))
print("translate= ", translate)

print("translate['jeter']= ", translate['jeter'])
print("translate['Hund']= ", translate['Hund'])
Code example #18
from nltk.corpus import swadesh

print(swadesh.fileids(), '\n')
print(swadesh.words('en'), '\n')

fr2en = swadesh.entries(['fr', 'en'])
print(fr2en, '\n')

translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'], '\n')

de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English

translate.update(dict(de2en))
translate.update(dict(es2en))

print(translate['Hund'])
print(translate['perro'], '\n')

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']

for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
Code example #19
File: corpus.py  Project: spinto88/Lector_y_analisis
use of the .xml database.
"""
from lxml import etree
from datetime import date
from datetime import timedelta
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import numpy as np
import codecs

week_days = ['Lun', 'Mar', 'Mier', 'Jue', 'Vier', 'Sab', 'Dom']

# Auxiliary functions and things
from nltk.corpus import swadesh
from nltk.corpus import stopwords
common_words = swadesh.words('es') + stopwords.words('spanish')

def phrase_variation(phrase):

    phrase_lower = phrase.lower()
    phrase_upper = phrase.upper()
    phrase_capitalize = phrase.capitalize()
    phrase_title = phrase.title()

    phrases2check = [phrase, phrase_lower, \
                 phrase_upper, phrase_capitalize, \
                 phrase_title]

    return phrases2check

# Class Note 
Code example #20
File: traducciones.py  Project: Naranjoinc/NLP
from nltk.corpus import swadesh

## What is inside swadesh: which languages it covers
print(swadesh.fileids())

## See which words swadesh has in English
print(swadesh.words('en'))

## Object used to build a dictionary for translating words from French to English
fr2es = swadesh.entries(['fr', 'en'])
print(fr2es)

## Build the French-to-English dictionary
translate = dict(fr2es)
print(translate['chien'])  ## translate the French word 'chien' into English
Code example #21
File: pos_count.py  Project: 3pin/textbot
def mostCommon(dataArray):
    import nltk
    from nltk.corpus import stopwords
    from nltk.corpus import swadesh
    from nltk import bigrams
    import re
    import json
    import operator
    from collections import Counter, OrderedDict
    import string
    from bs4 import UnicodeDammit
    #
    # additional words to omit
    # extra_words = ['be', 'psb']
    extra_words = []
    #
    stringAll = ''
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""
    regex_str = [
        emoticons_str,
        r'<[^>]+>',  # HTML tags
        r'(?:@[\w_]+)',  # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
        r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
        r'(?:[\w_]+)',  # other words
        r'(?:[\uD83C-\uDBFF\uDC00-\uDFFF]+)',
        r'(?:\S)'  # anything else
    ]
    tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                           re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^' + emoticons_str + '$',
                             re.VERBOSE | re.IGNORECASE)

    #
    def tokenize(s):
        return tokens_re.findall(s)

    #
    def preprocess(s, lowercase=False):
        tokens = tokenize(s)
        if lowercase:
            tokens = [
                token if emoticon_re.search(token) else token.lower()
                for token in tokens
            ]
        return tokens

    #
    for row in dataArray:
        for item in row:
            stringAll = stringAll + item
        stringAll = stringAll + " "
    #print('stringAll...' + '\n' + stringAll)
    #
    count_all = Counter()
    # count everything
    terms_all = [term for term in preprocess(stringAll) if len(term) > 1]
    #
    # Count terms only once, equivalent to Document Frequency
    terms_single = set(terms_all)
    #
    # build up the words to omit...
    punctuation = list(string.punctuation)
    common_words = swadesh.words('en')
    stop_words = stopwords.words('english')
    omitted_words = punctuation + common_words + stop_words + extra_words
    #
    # Create a list with all the terms EXCLUDING punctuations
    terms_nostop = [
        term for term in preprocess(stringAll)
        if term not in omitted_words and len(term) > 1
    ]
    # Count hashtags only
    terms_hash = [
        term for term in preprocess(stringAll)
        if term.startswith('#') and len(term) > 1
    ]
    # Count terms only (no hashtags, no mentions); startswith() takes a tuple (not a list) when matching several prefixes
    terms_only = [
        term for term in preprocess(stringAll) if term not in stop_words
        and not term.startswith(('#', '@')) and len(term) > 1
    ]
    # Count word pairs (co-occurrences)
    terms_bigram = bigrams(terms_nostop)
    # Update the counter
    count_all.update(terms_nostop)
    # Print the most frequent words
    print(count_all.most_common(20))
    #
    sorted_dictionary = OrderedDict(
        sorted(count_all.items(), key=lambda t: t[1], reverse=True))
    print(sorted_dictionary.keys())
    # sort by VAL in descending order
    #print( sorted(count_all.values()) )
    #count_all_sorted = sorted(count_all.items(), key=operator.itemgetter(1))
    #print(count_all_sorted)
    '''
    # ask user for number which Most_Common_Count must be above, then create new dict with Top_Counts
    from tkinter.simpledialog import askstring
    shortdict_tresh = askstring("PromptWindow", "Enter cutoff integer in POS most-common-count")
    shortdict_num = int(shortdict_tresh)
    shortdict = dict((key,value) for key, value in sorted_dictionary.items() if value > shortdict_num)
    '''

    shortdict = dict((key, value) for key, value in sorted_dictionary.items())
    #print('shortdict... ')
    #print(str(shortdict))
    dict_title = str(shortdict)
    #
    # take the most-common counts and calculate a weighted score per string in the lemma dataArray...
    processList = []
    for row in dataArray:
        #print(row)
        count = 0
        ratingCounter = 0
        row = row.split()
        for word in row:
            #print(word)
            for key, value in shortdict.items():
                if word == key:
                    count = count + value
        processList.append(count)
    #print('processList... ')
    #print(processList)
    return dict_title, processList
Code example #22
#!/usr/bin/python3

import nltk.corpus as corpus
from nltk.corpus import udhr
from nltk.corpus import swadesh

text = udhr.sents('Spanish-Latin1')
es = swadesh.words('es')
spanish_to_english = swadesh.entries(['es', 'en'])
trans = dict(spanish_to_english)

for sentence in text:
    for i in range(len(sentence)):
        if sentence[i] in es:
            print(trans[sentence[i]], end=' ')
        else:
            print("UNK", end=' ')
    print('')
Code example #23
cfd.plot()

entries = nltk.corpus.cmudict.entries()
len(entries)

for entry in entries[42371:42379]:
    print(entry)

syllable = ['N', 'IHO', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]

[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']

from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']

de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
Code example #24
File: comparatie.4.3.py  Project: loveclj/python
__author__ = 'lizhifeng'
from nltk.corpus import  swadesh

print swadesh.fileids()
print swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
print fr2en

translate = dict(fr2en)
print translate["chien"]
Code example #25
# An example of a tabular lexicon is the comparative wordlist. NLTK includes so-called Swadesh wordlists, lists of about 200 common words in several languages. The Swadesh list is used in the quantitative assessment of the genealogical relatedness of languages. 

from nltk.corpus import swadesh

print(swadesh.fileids()) 

# prints out the language identifiers (two-letter code).
print() # prints out an empty line.

print(swadesh.words("de"))

# prints out 200 common German words from swadesh.
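A short follow-on sketch (not part of the original snippet): because the Swadesh lists are aligned across languages, entries() pairs them up, which is what makes the comparative use described above possible. The German-English pair here is chosen only as an example.

de2en = swadesh.entries(["de", "en"])   # aligned German-English word pairs
print(de2en[:5])

# entries spelled identically in both languages: a very rough similarity signal
identical = [de for de, en in de2en if de.lower() == en.lower()]
print(len(identical), identical[:10])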
Code example #26
def get_frequncy_dist(dir_path):
    files = os.listdir(dir_path)

    all_words = 0
    words_wt_freq = {}   
    '''get words'''
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler :
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str (all_words - 1))
    logger.debug('# unique words: ' + str (len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())            
    logger.debug ('# words after lemmatized: ' + str (lemmatized_size) + " diff: " + str (len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {} # Save memory

    
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)            
    filtered_words = []

    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write ('Word, Type, Frequency \n')
        
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)   
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering stop words: ' + str (len(filtered_words)) + " diff: " + str (len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = [] #save memory

    '''wordnet has 155k'''                                 
    usual_words = []
    for word in  filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unused words: ' + str (len(usual_words)) + " diff: " + str (lemmatized_size - len(usual_words)))
    filtered_words = [] # save memory 

    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if(en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('N,' + word) 
            else:
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)    
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)     
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]        
                    #print (word,tag)   
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unwanted pos:' + str (len(tag_filtered_words_wt_freq.keys())) + " diff: " + str (len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {} # save memory
    usual_words = [] #save memory

    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word] 
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering basic words: ' + str (len(non_basic_words_wt_freq.keys())) + " diff: " + str (len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {} #save memory


    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')    
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word] 
    logger.debug ('# words after filtering my words: ' + str (len(new_words_wt_freq.keys())) + " diff: " + str (len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write (word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    
    return new_words_wt_freq