Example #1
 def correct_word(self, token):
     """
     Correct a word using enchant, the NLTK cess_esp corpus, and Levenshtein distance.
     :param token: word to correct
     :return: the closest word, or None if nothing is within the distance threshold
     """
     if token in self._corrected_words:
         return self._corrected_words[token]
     suggested = enchant.Dict('es').suggest(token)
     for similar_word in suggested:
         if SpanishCorpus.levenshtein(token, similar_word) <= SpanishCorpus.levenshtein_distance:
             self._corrected_words[token] = similar_word
             print(u'--> Corrected word: {} --> {}'.format(token, similar_word))
             return similar_word
     minimum = sys.maxsize  # sys.maxint was removed in Python 3
     similar_word = ''
     for word in cess_esp.words():
         lev_dist = SpanishCorpus.levenshtein(token, word)
         if (lev_dist < minimum) or (lev_dist == minimum and
                                     len(token) == len(word) and len(similar_word) != len(token)):
             minimum = lev_dist
             similar_word = word
             if lev_dist == 0:
                 break
     if minimum <= SpanishCorpus.levenshtein_distance:
         self._corrected_words[token] = similar_word
         print(u'--> Corrected word: {} --> {}'.format(token, similar_word))
         return similar_word
     return None
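
This example calls SpanishCorpus.levenshtein, which the snippet does not include. A minimal sketch of the standard dynamic-programming edit distance it presumably wraps (the function name and the levenshtein_distance threshold come from the call sites above; the body is an assumption):

def levenshtein(a, b):
    # Classic two-row DP: cost 1 per insertion, deletion, or substitution.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                  # deletion
                            curr[j - 1] + 1,              # insertion
                            prev[j - 1] + (ca != cb)))    # substitution
        prev = curr
    return prev[-1]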
Example #2
    def __init__(self, language):
        self.language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et al., 2017)
        if language == 'english':
            self.avg_word_length = 5.3
            text = brown.words()
            self.fdist = nltk.FreqDist(text)
        else:  # spanish
            self.avg_word_length = 6.2
            text = cess_esp.words()
            self.fdist = nltk.FreqDist(w.lower() for w in text)

        self.total = len(text)

        # models
        self.model1 = MLPClassifier(random_state=2)
        self.model2 = svm.SVC(random_state=2)
        self.model4 = RandomForestClassifier(random_state=2)
        self.model5 = LogisticRegression(random_state=2)

        # hard voting classifier
        if language == 'spanish':
            estimators = [('mlp', self.model1), ('rf', self.model4), ('lr', self.model5)]
        else:
            estimators = [('svc', self.model2), ('rf', self.model4), ('mlp', self.model1)]

        self.vote = VotingClassifier(estimators, voting='hard')
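
The fdist/total pair built above is presumably consumed by a word-frequency feature; a minimal sketch of such a helper (the method name rel_freq is hypothetical, not from the source):

    def rel_freq(self, word):
        # Hypothetical helper: relative corpus frequency, mirroring how
        # __init__ cases the words (raw for English, lowercased for Spanish).
        key = word if self.language == 'english' else word.lower()
        return self.fdist[key] / float(self.total)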
Example #3
def make_and_save_lookup_tagger(fname):
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())

    likely_tags = dict((word, fd_tagged_words[word].max()) for word in cess_esp.words())
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)

    with open(fname, 'wb') as output:
        dump(lookup_tagger, output, -1)  # requires: from pickle import dump
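
A quick usage sketch for the tagger pickled above (the file name is illustrative; load is pickle's counterpart to the dump call):

from pickle import load

with open('lookup_tagger.pkl', 'rb') as f:  # hypothetical file name
    tagger = load(f)
# A model-only UnigramTagger tags words missing from the model as None.
print(tagger.tag(['El', 'grupo', 'estatal']))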
Example #4
 def create_espBoWLexicon(self):  # CESS esp corpus
     BoW = Counter()
     lexicon = {}
     for word in cess_esp.words():
         BoW[word] += 1.
     threshold = stats.scoreatpercentile(list(BoW.values()), 10)
     for word, count in BoW.items():
         if count >= threshold:
             lexicon[word] = count
     return BoW, lexicon
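
For reference, stats.scoreatpercentile(values, 10) returns the (interpolated) count at the 10th percentile of the type counts, so the lexicon above drops roughly the rarest tenth of word types; a toy check with made-up counts:

from collections import Counter
from scipy import stats

toy = Counter({'de': 50, 'la': 30, 'que': 20, 'zarzuela': 1})
print(stats.scoreatpercentile(list(toy.values()), 10))  # interpolated low cutoff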
Example #5
def make_and_save_most_common_words_lookup_tagger(fname, number):
    fd_words = nltk.FreqDist(cess_esp.words())
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())

    most_common_words = [item[0] for item in fd_words.most_common(number)]
    likely_tags = dict((word, fd_tagged_words[word].max()) for word in most_common_words)
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)

    with open(fname, 'wb') as output:
        dump(lookup_tagger, output, -1)  # requires: from pickle import dump
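
One caveat for both lookup taggers: a UnigramTagger built from a model alone tags every word outside the model as None. NLTK supports a backoff tagger for that case; a sketch of a variant ending for the function above (the DefaultTagger fallback is an assumption, not in the source):

    # Variant: fall back to the corpus's single most frequent tag for unseen words.
    fd_tags = nltk.FreqDist(tag for word, tag in cess_esp.tagged_words())
    default = nltk.DefaultTagger(fd_tags.max())
    lookup_tagger = nltk.UnigramTagger(model=likely_tags, backoff=default)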
Example #6
    def __init__(self, language):
        self.language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et al., 2017)
        if language == 'english':
            self.avg_word_length = 5.3
            text = brown.words()

        else:  # spanish
            self.avg_word_length = 6.2
            text = cess_esp.words()


        self.fdist = nltk.FreqDist(w.lower() for w in text)
        self.model = KNeighborsClassifier()
        self.total = len(text)
Example #7
def descargarCorpus():
    print(
        "   Phase 2/6  Downloading corpus..................please wait \n"
    )
    '''
    sino = "n"
    sino = raw_input("    ===> Download the corpus DATASET S/N ?  ")
    sino = sino.lower()
    if sino == "s":
        print("\n starting the download...... please wait \n")
        #nltk.download("movie_reviews")
        nltk.download("spanish_grammars")
        nltk.download("cess_esp")
        print("\n  dataset downloaded \n")
    if sino == "n":
        print("\n  processing dataset in memory \n")
        reviews = [(list(movie_reviews.words(fileid)), category)
                   for category in movie_reviews.categories()
                   for fileid in movie_reviews.fileids(category)]
        new_train, new_test = reviews[0:100], reviews[101:200]
        print(new_train[0])
        print("\n\n")
    '''
    print(
        "\n  processing dataset in memory.... please wait a moment \n")
    reviews = list(cess_esp.words())  #reviews = [(list(cess_esp.words(fileid)), category)]
    new_train, new_test = reviews[0:100], reviews[101:200]
    print("\n        processing the cess_esp dataset in memory \n\n")
    #print(str(reviews))
    print("\n\n")
    print(new_train)
    print("\n\n      Test...... \n")
    print(new_test)
    print("\n\n")
    return
Example #9
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)
Example #10
 def test_esp(self):
     words = cess_esp.words()[:15]
     txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
     self.assertEqual(words, txt.split())
     self.assertEqual(cess_esp.words()[115], "años")
Example #11
import nltk
from nltk.corpus import cess_esp
# Question 1.a
# long form
print("question 1a")
etiquetado = cess_esp.tagged_words()
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)
# another, simplified form (simplify_tags was removed in NLTK 3;
# tagset='universal' is the closest modern equivalent)
etiquetado = cess_esp.tagged_words(tagset='universal')
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)
# Question 1.b
print("question 1b")
vocabulario = set()
for field in cess_esp.fileids():
    # accumulate the vocabulary across files (plain assignment, as in the
    # original, would keep only the last file's words)
    vocabulario |= set(w.lower() for w in cess_esp.words(field))
print(vocabulario)
# Question 1.c
print("question 1c")
etiquetado = cess_esp.tagged_words()
for i in etiquetado:
    print(i[0], " ", i[1])
# Question 1.d
print("question 1d")
t = cess_esp.parsed_sents()[0]
print(t)
# Question 2
print("question 2")
from xml.dom import minidom
dom = minidom.parse("/home/javier/ALC/python/frase_ancora.xml")
nodes = dom.childNodes
Example #12
    def __init__(self, language, trainset):
        self.language = language
        pos_tags_nltk = [
            'cc',
            'cd',
            'dt',
            'in',
            'jj',
            'jjr',
            'jjs',
            'nn',
            'nns',  # tag
            'nnp',
            'nnps',
            'pdt',
            'pos',
            'prp',
            'prp$',
            'rb',
            'rbr',
            'rbs',
            'rp',
            'sym',
            'vb',
            'vbd',
            'vbg',
            'vbn',
            'vbp',
            'vbz',
            'wdt',
            'wp',
            'wp$',
            'wrb'
        ]
        self.vowels = ['a', 'e', 'i', 'o', 'u']
        self.vec = CountVectorizer(vocabulary=pos_tags_nltk)  # tag vector

        self.cmudict = nltk.corpus.cmudict.dict()  #syl dict

        self.model = xgb.XGBClassifier(
            learning_rate=0.1,  # XGboost classifier
            eta=1,
            silent=1,
            nround=10,
            n_estimators=1000,
            max_depth=6,
            min_child_weight=1,
            gamma=0.1,
            reg_alpha=0.005,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=8,
            scale_pos_weight=1,
            seed=27)
        self.word_fre = {}
        self.bigram_fre = {}
        self.trigram_fre = {}
        self.tetgram_fre = {}
        for row in trainset:
            sen = row["sentence"]
            target_word = row["target_word"]
            for word in target_word.split(' '):
                if word in self.word_fre:
                    self.word_fre[word] += 1
                else:
                    self.word_fre[word] = 1
            for word in sen.split(' '):
                for i in range(len(word) - 1):
                    if word[i:i + 2] in self.bigram_fre:
                        self.bigram_fre[word[i:i + 2]] += 1
                    else:
                        self.bigram_fre[word[i:i + 2]] = 1  # first occurrence counts as 1, not 0
                for i in range(len(word) - 2):
                    if word[i:i + 3] in self.trigram_fre:
                        self.trigram_fre[word[i:i + 3]] += 1
                    else:
                        self.trigram_fre[word[i:i + 3]] = 1  # first occurrence counts as 1, not 0

        if self.language == "english":
            self.avg_word_length = 5.3
            brown_corpus = brown.categories()  # Brown corpus categories
            for i in range(len(brown_corpus)):
                file = brown.words(categories=brown_corpus[i])
                for word in file:
                    if word not in self.word_fre:
                        self.word_fre[word] = 1
                    else:
                        self.word_fre[word] += 1
        else:
            self.avg_word_length = 6.2
            word = cess_esp.words()  # Spanish corpus

            for item in word:
                if item in self.word_fre:
                    self.word_fre[item] += 1
                else:
                    self.word_fre[item] = 1
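
The character bigram/trigram loops above can be expressed more compactly with nltk.util.ngrams, which accepts any sequence, including strings; an equivalent sketch for a single word:

from collections import Counter
from nltk.util import ngrams

def char_ngram_counts(word, n):
    # Counts of the character n-grams inside one word (n=2 for bigrams, etc.).
    return Counter(''.join(g) for g in ngrams(word, n))

print(char_ngram_counts('palabra', 2))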
Example #13
def test_esp():
    words = cess_esp.words()[:15]
    txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
    assert words == txt.split()  # plain function, so a bare assert instead of self.assertEqual
Example #14
#! -*- encoding: utf8 -*-

# Access the Spanish corpus cess_esp
from nltk.corpus import cess_esp
from nltk.probability import *
import unittest
from operator import itemgetter

# Show the number of words in this corpus
print("\n\nEXERCISE 1\n")
print(
    "\n--------------------------------------------------------------------------------\n "
)
palabras = len(cess_esp.words())
print("1.1) Loading the cess_esp corpus...")
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.2) Number of words in the corpus: \n" + str(palabras))

# Get the sentences of the corpus:
frases = cess_esp.sents()
numFrases = len(frases)
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.3) Number of sentences in the corpus: \n" + str(numFrases))

nomFichero = cess_esp.fileids()[0]
texto = cess_esp.words(nomFichero)
Example #15
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)
Example #16
        print("!! NOT FOUND IN ANY LIST")

botorhuman=np.array(train_bt).flatten()
botorhuman_dev=np.array(dev_bt).flatten()

gender = np.array(train_g).flatten()
gender_dev = np.array(dev_g).flatten()

### HELPER FUNCTIONS ###

# besides the POS tags, the following are also available:
# token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop

nlp = es_core_news_sm.load()
stop = stopwords.words('spanish')
word_list = cess.words()
word_set = set(word_list)
senti_clf = SentimentClassifier()


def PosTagger(text):
    doc = nlp(text)
    pos = []
    for token in doc:
        pos.append(token.pos_)
    counter = collections.Counter(pos)
    try:
        noun = counter.get('NOUN')/len(doc)
    except:
        noun = 0
    try:
Example #17
#Ramon Ruiz Dolz
#Salvador Marti Roman

from nltk.corpus import cess_esp
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import *
import os
import nltk
dir_path = os.path.dirname(os.path.realpath(__file__))
corpus_root = dir_path.replace(".idea", "")
nltk.data.path.append(dir_path + "\\NLTK")
# EXERCISE 1
print("#act2")
print(len(cess_esp.words()))

print("#act3")
print(len(cess_esp.sents()))

print("#act4")
text = cess_esp.words(cess_esp.fileids()[0])
fdist = FreqDist(text)
print(fdist.most_common(20))

print("#act5")
voc = [w for w, f in fdist.most_common()]
print(voc)

print("#act6")
print(list(w for w in voc if len(w) > 7 and fdist[w] > 2))

print("#act7")
Example #19
import os  # needed for os.path.join / os.getcwd below; missing from the original snippet
import re

#nltk is the python library for Natural Language Processing (used here for cleaning non-English text from the data)
from nltk.corpus import brown
from nltk.corpus import words
from nltk.corpus import cess_esp as spanish
from nltk.corpus import reuters
from nltk.corpus import nps_chat

#These dictionaries are used to reduce time required to search for English words by implementing a hash search in "isEnglishWord"
englishBrownDict = dict.fromkeys(brown.words(), True)
englishWordsDict = dict.fromkeys(words.words(), True)
englishReutersDict = dict.fromkeys(reuters.words(), True)
englishChatDict = dict.fromkeys(nps_chat.words(), True)

spanishWordsDict = dict.fromkeys(spanish.words(), True)
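
The isEnglishWord helper mentioned in the comment above is not part of this snippet; a minimal sketch consistent with the hash-search idea (the name comes from the comment, the body is an assumption):

def isEnglishWord(word):
    # O(1) dict-membership probes instead of scanning the corpus word lists.
    return (word in englishBrownDict or word in englishWordsDict
            or word in englishReutersDict or word in englishChatDict)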

malayText = open(os.path.join(os.getcwd(), "malayUpdated.txt"))
malayWordsDict = []

for line in malayText:
    malayWordsDict.append(line.strip())  # strip the trailing newline from each entry

#print "Count of malay words: ", len (malayWords), "\n"
#malayWordsDict = dict.fromkeys (malayWords, True)

commonTweetWords = [
    "ur", "u", "youre", "gonna", "wanna", "wannabe", "shoulda", "should've",
    "coulda", "could've", "woulda", "would've", "thats", "that's", "whats",
    "what's", "hadnt", "hadn't", "couldnt", "couldn't", "wouldnt", "wouldn't",
Example #20
from nltk.corpus import cess_esp  # needed below; missing from the original snippet
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist  # needed below; missing from the original snippet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
tokenizer = RegexpTokenizer(r'\w+')


def remove_stopwords(text, language="english"):
    stopwordsw = stopwords.words(language)
    result = [w for w in text if w.lower() not in stopwordsw]
    return result


# Exercise 1
# Show the number of words in this corpus
print("The number of words in this corpus is %d" % len(cess_esp.words()))
# Show the number of sentences it contains
print("The number of sentences in this corpus is %d" % len(cess_esp.sents()))
"""
Get the frequency of occurrence of the items that make up the first file of the corpus
above. An item is a (key, value) pair where key is the word and value is the word's
frequency of occurrence. Display the 20 most frequent.
"""
text = cess_esp.words(cess_esp.fileids()[0])

fdist = FreqDist(text)
print(fdist.most_common(20))
# Get the vocabulary of the first file of the corpus (sorted by frequency).
print("vocabulary sorted by frequency")
p = [w for w, f in fdist.most_common()]
print(p)
Example #21
from nltk.corpus import cess_esp
from nltk.probability import *

firstfile = cess_esp.words(cess_esp.fileids()[0])
fdist = FreqDist(firstfile)
print(fdist.most_common(20))
Example #22
# Exercise 1

# 1
from nltk.corpus import cess_esp
# 2
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist

first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0))
# 8
print("8. No de palabras que aparecen una sola vez:",
      len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. La palabra más frecuente es", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader

mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
Example #23
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k

    return " ".join(reversed(out))


link = set()
words = set(
    list(wd.words()) + list(brown.words()) + word_man + list(udhr.words()) +
    list(cess.words()))
some_variable = 0


def fcn(domain_data, pt, date):
    list_no = domain_data[1]
    forbids = [
        '[', '`', '\\', '-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*',
        '(', ')', '_', '+', '\\', '[', '\\', ']', '{', '}', ';', "'", '\\',
        ':', '"', '|', '<', ',', '.', '/', '<', '>', '?', ']'
    ]
    file = open('filtered_domains.txt', 'a')
    global words, link, some_variable, result_list, result_list_b, master_data
    domain = domain_data[0]
    inter = list(set(forbids) & set(domain.split(".")[0]))
    # FILTER 1
Example #24
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)
Example #25
    stringl = " ".join(map(str, listaimp))
    print(stringl)


def printfreq(listaimp):
    freqtoprint = FreqDist(listaimp)
    print(freqtoprint.most_common(20))


# Exercise 1

# 1 Access the Spanish corpus cess_esp
from nltk.corpus import cess_esp as corpus

# 2 Show the number of words in this corpus
print(len(corpus.words()))

# 3 Show the number of sentences it contains
print(len(corpus.sents()))

# 4 Get the frequency of occurrence of the items that make up the first file of the
# corpus above. An item is a (key, value) pair where key is the word and value is the
# word's frequency of occurrence. Display the 20 most frequent.
text1 = corpus.words(corpus.fileids()[0])
fdist = FreqDist(text1)
print(fdist.most_common(20))

# 5 Get the vocabulary of the first file of the corpus (sorted by frequency)
#vocxfrec= sorted([(b,a) for a,b in sorted([(y,x) for x,y in fdist.keys()])])
#vocxfrec = sorted([key for key in sorted([(value, key) for key,value in fdist.most_common()])])
vocxfrec = [key for (key, value) in fdist.most_common()]
Example #26
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)
Example #27
from nltk.corpus import cess_esp
from nltk.probability import FreqDist

fdist = FreqDist(cess_esp.words(cess_esp.fileids()[0]))
print("La palabra mas frecuente es ", fdist.max())
Example #28
import nltk
from nltk.corpus import cess_esp
import pylab
palabras = cess_esp.words()
palabras1 = palabras[:1000]
freqdist = nltk.FreqDist(palabras1)
freqdist.plot()
Example #29
from collections import Counter
import pickle
from nltk.corpus import brown, cess_esp
from nltk.util import ngrams
from collections import defaultdict

# Map words and the trigrams contained in them to their absolute frequency in the
# Brown Corpus (English) or the CESS_ESP corpus (Spanish)
if __name__ == '__main__':
    brown_words = brown.words()
    english_freqs = Counter(brown_words)
    esp_words = cess_esp.words()
    spanish_freqs = Counter(esp_words)
    with open('spanish_freqs.pkl', 'wb') as esp_f:
        pickle.dump(spanish_freqs, esp_f)
    with open('english_freqs.pkl', 'wb') as en_f:
        pickle.dump(english_freqs, en_f)

    en_trigrams = defaultdict(int)
    for word in brown_words:
        for trigram in [
                '{}{}{}'.format(t[0], t[1], t[2])
                for t in list(ngrams(word, 3))
        ]:
            en_trigrams[trigram] += 1
    with open('english_trigram_freqs.pkl', 'wb') as en_t_f:
        pickle.dump(en_trigrams, en_t_f)

    esp_trigrams = defaultdict(int)
    for word in esp_words:
        for trigram in [
                '{}{}{}'.format(t[0], t[1], t[2])
                for t in list(ngrams(word, 3))
        ]:
            esp_trigrams[trigram] += 1
    # Tail restored by analogy with the English block above; the snippet was
    # truncated here, so the output file name is assumed.
    with open('spanish_trigram_freqs.pkl', 'wb') as esp_t_f:
        pickle.dump(esp_trigrams, esp_t_f)
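
Reading one of the pickles back is symmetric (file name from the code above):

with open('spanish_freqs.pkl', 'rb') as f:
    spanish_freqs = pickle.load(f)
print(spanish_freqs['de'])  # absolute frequency of 'de' in CESS-ESP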
Example #30
 def __init__(self, language):
     self.language = language
     # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et al., 2017)
     if language == 'english':
         self.avg_word_length = 5.3
         # from Beker, Henry; Piper, Fred. Cipher Systems: The Protection of Communications.
         self.char_frequency = {
             'a': 8.167,
             'b': 1.492,
             'c': 2.782,
             'd': 4.253,
             'e': 12.702,
             'f': 2.228,
             'g': 2.015,
             'h': 6.094,
             'i': 6.966,
             'j': 0.153,
             'k': 0.772,
             'l': 4.025,
             'm': 2.406,
             'n': 6.749,
             'o': 7.507,
             'p': 1.929,
             'q': 0.095,
             'r': 5.987,
             's': 6.327,
             't': 9.056,
             'u': 2.758,
             'v': 0.978,
             'w': 2.360,
             'x': 0.150,
             'y': 1.974,
             'z': 0.074
         }
         self.dic = pyphen.Pyphen(lang='en')
         self.reuters = reuters.words()
         self.unigram_counts = Counter(self.reuters)
         bigrams = []
         for sent in reuters.sents():
             bigrams.extend(
                 nltk.bigrams(sent, pad_left=True, pad_right=True))
         self.bigram_counts = Counter(bigrams)
     else:  # spanish
         self.avg_word_length = 6.2
         # self.char_frequency = {'a': 12.525,'b': 2.215,'c': 4.139,'d': 5.860,'e': 13.681,
         #                        'f': 0.692,'g': 1.768,'h': 0.703,'i': 6.247,'j': 0.443,
         #                        'k': 0.011,'l': 4.967,'m': 3.157,'n': 6.71,'o': 8.683,
         #                        'p': 2.510, 'q': 0.877,'r': 6.871,'s': 7.977,'t': 4.632,
         #                        'u': 3.927, 'v': 1.138,'w': 0.017,'x': 0.215,'y': 1.008,
         #                        'z': 0.517,'á': 0.502, 'é': 0.433, 'í': 0.725, 'ñ': 0.311,
         #                        'ó': 0.827, 'ú': 0.168, 'ü': 0.012}
         # self.dic = pyphen.Pyphen(lang='es')
         self.cess = cess.words()
         self.unigram_counts = Counter(self.cess)
         bigrams = []
         for sent in cess.sents():
             bigrams.extend(
                 nltk.bigrams(sent, pad_left=True, pad_right=True))
         self.bigram_counts = Counter(bigrams)
     # self.clf = svm.SVC()
     # self.model = LogisticRegression()
     self.model = svm.SVC(gamma=5)
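
The padded unigram/bigram Counters built above are presumably consumed by a conditional-probability feature; a hypothetical helper (the name bigram_prob is not from the source):

 def bigram_prob(self, w1, w2):
     # P(w2 | w1) estimated from the Counters; 0.0 when w1 was never seen.
     if self.unigram_counts[w1] == 0:
         return 0.0
     return self.bigram_counts[(w1, w2)] / float(self.unigram_counts[w1])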
Example #31
import sys

if len(sys.argv) == 1:
    print(
        'Pass the corpus file as an argument. E.g.: python createDictFromCorpus.py corpus.txt'
    )
elif sys.argv[1] == 'cess_esp':
    import json
    from nltk.corpus import cess_esp

    file = 'cess_esp'
    wordsToLower = map(lambda x: x.lower(), cess_esp.words())
    d = dict.fromkeys(set(wordsToLower), 1)
else:
    import os
    import json
    from nltk.corpus import PlaintextCorpusReader

    path, file = os.path.split(sys.argv[1])
    corpus = PlaintextCorpusReader(path, file)
    wordsToLower = map(lambda x: x.lower(), corpus.words())
    d = dict.fromkeys(set(wordsToLower), 1)

jsonF = json.dumps(d)
f = open(file + ".json", "w")
f.write(jsonF)
f.close()