# Imports inferred from usage; the original snippet did not include them.
import nltk
from nltk.corpus import cess_esp
from pickle import dump


def make_and_save_lookup_tagger(fname):
    # Most frequent tag for every word in the corpus.
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())
    likely_tags = dict((word, fd_tagged_words[word].max())
                       for word in cess_esp.words())
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)
    output = open(fname, 'wb')
    dump(lookup_tagger, output, -1)
    output.close()
def make_and_save_most_common_words_lookup_tagger(fname, number):
    # Restrict the lookup model to the `number` most common words.
    fd_words = nltk.FreqDist(cess_esp.words())
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())
    most_common_words = [item[0] for item in fd_words.most_common(number)]
    likely_tags = dict((word, fd_tagged_words[word].max())
                       for word in most_common_words)
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)
    output = open(fname, 'wb')
    dump(lookup_tagger, output, -1)
    output.close()
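# A minimal usage sketch, not part of the original snippets; the file name
# 'lookup_tagger.pickle' is an assumption. Build a lookup tagger over the
# 1000 most common words, reload it with pickle and tag a sentence. Words
# outside the model are tagged None, since no backoff tagger is set.
from pickle import load

make_and_save_most_common_words_lookup_tagger('lookup_tagger.pickle', 1000)
with open('lookup_tagger.pickle', 'rb') as f:
    tagger = load(f)
print(tagger.tag(['El', 'gato', 'come', 'pescado']))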
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import cess_esp

# Question 1.a
# long form
print("question 1a")
etiquetado = cess_esp.tagged_words()
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)

# another, simplified form. NLTK 3 removed simplify_tags; the tagset
# argument replaces it, with unmapped fine-grained tags falling back to 'X'.
etiquetado = cess_esp.tagged_words(tagset='universal')
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)

# Question 1.b
print("question 1b")
for fileid in cess_esp.fileids():
    vocabulario = set(w.lower() for w in cess_esp.words(fileid))
    print(vocabulario)

# Question 1.c
print("question 1c")
etiquetado = cess_esp.tagged_words()
for word, tag in etiquetado:
    print(word, " ", tag)

# Question 1.d
print("question 1d")
t = cess_esp.parsed_sents()[0]
print(t)

# Question 2 (the original snippet is truncated after this import)
print("question 2")
from xml.dom import minidom
import random
import pickle

import nltk
from nltk.corpus import movie_reviews, cess_esp
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from statistics import mode

# Spanish adaptation: extend the es-cast3lb -> universal tag map so that
# the full fine-grained cess_esp tags resolve via their two-letter prefix.
nltk.tag.mapping._load_universal_map("es-cast3lb")
mapdict = nltk.tag.mapping._MAPPINGS["es-cast3lb"]["universal"]
alltags = set(t for w, t in cess_esp.tagged_words())
for tag in alltags:
    if len(tag) <= 2:  # these are complete
        continue
    mapdict[tag] = mapdict[tag[:2]]
cess_esp._tagset = "es-cast3lb"

from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

cess_sents = cess_esp.tagged_sents(tagset='universal')
uni_tag = ut(cess_sents, backoff=nltk.DefaultTagger('X'))


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    # (the rest of the class is truncated in the original snippet)
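# A minimal usage sketch, not part of the original snippet: tag a Spanish
# sentence with the unigram tagger trained above. Words the tagger has
# never seen fall back to the DefaultTagger and receive the 'X' tag.
print(uni_tag.tag(['Mi', 'casa', 'es', 'grande']))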
# Imports inferred from usage; the original snippet did not include them.
import sys
import time

import enchant
from nltk import FreqDist
from nltk.corpus import cess_esp, stopwords, wordnet
from nltk.tokenize import word_tokenize
from pattern.es import parse, singularize, conjugate, predicative, INFINITIVE


class SpanishCorpus:
    """Class SpanishCorpus to ease text mining in Spanish.

    The objective of this library is to generate a clean corpus of words
    based on a text in Spanish.

    Attributes:
        _text: Original text provided at initialization
        _tokens: Stores the result of the different filter functions
        _analysis: List of tuples with the lexical analysis result
        _corrected_words: Dict mapping misspelled words to their corrections
        _synonyms: List of sets of synonyms of every word in tokens
        _fdist: Instance of nltk.FreqDist
        _timing: True if you want to time the methods

    The methods are listed in the logical order of execution.
    """

    word_tag_fd = FreqDist(cess_esp.tagged_words())
    levenshtein_distance = 1

    def __init__(self, text, timing=False):
        """
        :param text: Original text
        :param timing: True if you want to time the methods
        """
        self._text = text
        self._tokens = None
        self._analysis = None
        self._corrected_words = {}
        self._synonyms = None
        self._fdist = None
        self._timing = timing

    @property
    def text(self):
        return self._text

    @text.setter
    def text(self, value):
        self._text = value

    @property
    def tokens(self):
        return self._tokens

    @tokens.setter
    def tokens(self, value):
        self._tokens = value

    @property
    def analysis(self):
        return self._analysis

    @analysis.setter
    def analysis(self, value):
        self._analysis = value

    @property
    def synonyms(self):
        return self._synonyms

    @synonyms.setter
    def synonyms(self, value):
        self._synonyms = value

    @property
    def fdist(self):
        return self._fdist.items()

    @fdist.setter
    def fdist(self, value):
        self._fdist = value

    @property
    def corrected_words(self):
        return self._corrected_words

    @corrected_words.setter
    def corrected_words(self, value):
        self._corrected_words = value

    def timing(method):
        """Decorator that times the execution of a method when enabled."""
        def timed(self, *args, **kwargs):
            if self._timing:
                t_start = time.time()
                result = method(self, *args, **kwargs)
                t_end = time.time()
                print('{0} --- {1} sec'.format(
                    method.__name__.ljust(25, ' '), t_end - t_start))
            else:
                result = method(self, *args, **kwargs)
            return result
        return timed

    @timing
    def tokenize(self):
        """Converts the text into a list of words.

        :return: Tokens
        """
        self._tokens = word_tokenize(self._text)
        return self._tokens

    @timing
    def clean(self):
        """Lowercases words and filters out non-alphabetic tokens.

        :return: Tokens
        """
        if self._tokens is None:
            raise Exception('It is necessary to execute tokenize first')
        self._tokens = [
            word.lower() for word in self._tokens
            if word.isalpha() and len(word) > 2
        ]
        return self._tokens

    @timing
    def filter_stop_words(self):
        """Filters stopwords out of tokens.

        :return: Tokens
        """
        if self._tokens is None:
            raise Exception('It is necessary to execute tokenize first')
        spanish_stopwords = stopwords.words('spanish')
        self._tokens = [
            word for word in self._tokens
            if word not in spanish_stopwords
        ]
        return self._tokens

    @classmethod
    def levenshtein(cls, s1, s2):
        """Calculates the Levenshtein distance between two words.

        :param s1:
        :param s2: Words to compare
        :return: Number of differences
        """
        if len(s1) < len(s2):
            return cls.levenshtein(s2, s1)
        if len(s2) == 0:
            return len(s1)
        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def correct_word(self, token):
        """Corrects a word using enchant, the nltk cess_esp corpus and the
        Levenshtein distance.

        :param token: Word to correct
        :return similar_word: Closest word
        """
        if token in self._corrected_words:
            return self._corrected_words[token]
        suggested = enchant.Dict('es').suggest(token)
        if len(suggested) > 0:
            for similar_word in suggested:
                if SpanishCorpus.levenshtein(
                        token,
                        similar_word) <= SpanishCorpus.levenshtein_distance:
                    self._corrected_words[token] = similar_word
                    print('--> Corrected word: {} --> {}'.format(
                        token, similar_word))
                    return similar_word
        minimum = sys.maxsize
        similar_word = ''
        for word in cess_esp.words():
            lev_dist = SpanishCorpus.levenshtein(token, word)
            if (lev_dist < minimum) or (lev_dist == minimum
                                        and len(token) == len(word)
                                        and len(similar_word) != len(token)):
                minimum = lev_dist
                similar_word = word
            if lev_dist == 0:
                break
        if minimum <= SpanishCorpus.levenshtein_distance:
            self._corrected_words[token] = similar_word
            print('--> Corrected word: {} --> {}'.format(token, similar_word))
            return similar_word
        else:
            return None

    def check_category_nltk(self, token, index):
        """Detects the word's category using the nltk library.

        :param token: Word to check
        :param index: Word's index in tokens
        :return category: Word's grammatical category
        """
        category = None
        for (wt, _) in SpanishCorpus.word_tag_fd.most_common():
            if token == wt[0]:
                category = wt[1].ljust(7, '0')
                if index >= len(self._analysis):
                    self._analysis.append([token, category])
                else:
                    self._tokens[index] = token
                    self._analysis[index] = [token, category]
                break
        return category

    def check_category_pattern(self, token, index):
        """Detects the word's category using the pattern library (when
        pattern does not know the word, it reports the word as a noun).

        :param token: Word to check
        :param index: Word's index in tokens
        :return category: Word's grammatical category
        """
        category = parse(token)
        if '/NN' in category:
            category = 'n'
        elif '/VB' in category:
            category = 'v'
        elif '/JJ' in category:
            category = 'a'
        # Fixed: the original `elif '/CC' or '/CS' in category` was always true.
        elif '/CC' in category or '/CS' in category:
            category = 'c'
        elif '/P' in category:
            category = 'p'
        else:
            category = '-'
        if index >= len(self._analysis):
            self._analysis.append([token, category.ljust(7, '0')])
        else:
            self._tokens[index] = token
            self._analysis[index] = [token, category.ljust(7, '0')]
        return category

    def analize_word(self, token, index, to_correct):
        """Categorizes a word lexically. It tries nltk first; if nltk does
        not know the word's category, it falls back to the pattern library.
        If that also fails and the word is not a foreign word, it tries to
        correct the word with enchant and cess_esp.

        :param token: Word to analyze
        :param index: Word's index in tokens
        :param to_correct: Indicates whether to try to correct the word
        """
        category = self.check_category_nltk(token, index)
        if not category:
            category = self.check_category_pattern(token, index)
            if to_correct and category == 'n' \
                    and any(c in ['a', 'e', 'i', 'o', 'u'] for c in token) \
                    and not enchant.Dict('en').check(token) \
                    and not enchant.Dict('fr').check(token) \
                    and not enchant.Dict('de_DE').check(token):
                new_token = self.correct_word(token)
                if new_token and new_token != token:
                    self.analize_word(new_token, index=index,
                                      to_correct=False)

    @timing
    def analize(self, to_correct):
        """Returns a list of tuples with the lexical analysis of tokens.

        :param to_correct: Indicates whether to try to correct words
        :return: Result of the analysis
        """
        if self._tokens is None:
            raise Exception('It is necessary to execute tokenize first')
        self._analysis = []
        for i in range(len(self._tokens)):
            self.analize_word(self._tokens[i], index=i, to_correct=to_correct)
        return self._analysis

    @timing
    def clean_post_analysis(self):
        """Filters determiners, pronouns and conjunctions out of tokens.

        :return: Tokens
        """
        if self._analysis is None:
            raise Exception('It is necessary to execute analize first')
        new_tokens = []
        new_analysis = []
        new_synonyms = []
        for i in range(len(self._tokens)):
            if self._analysis[i][1][0] != 'd' \
                    and self._analysis[i][1][0] != 'p' \
                    and self._analysis[i][1][0] != 'c':
                new_tokens.append(self._tokens[i])
                new_analysis.append(self._analysis[i])
                if self._synonyms:
                    new_synonyms.append(self._synonyms[i])
        self._tokens = new_tokens
        self._analysis = new_analysis
        self._synonyms = new_synonyms
        if self._fdist:
            self.calculate_frequencies()
        return self._tokens

    @timing
    def unify_tokens(self):
        """Singularizes nouns, conjugates verbs to the infinitive and
        converts adjectives to their predicative form.

        :return: Tokens
        """
        if self._analysis is None:
            raise Exception('It is necessary to execute analize first')
        for i in range(len(self._tokens)):
            if self._analysis[i][1][0] == 'n':
                self._tokens[i] = singularize(self._tokens[i])
            elif self._analysis[i][1][0] == 'v':
                self._tokens[i] = conjugate(self._tokens[i], INFINITIVE)
            elif self._analysis[i][1][0] == 'a':
                self._tokens[i] = predicative(self._tokens[i])
        return self._tokens

    @timing
    def synonymize(self):
        """Returns a list of sets of synonyms of every word in tokens.
        Only searches for synonyms of nouns and verbs.

        :return: Synonyms
        """
        if self._analysis is None:
            raise Exception('It is necessary to execute analize first')
        self._synonyms = []
        for i in range(len(self._tokens)):
            if self._analysis[i][1][0] == 'n':
                synsets = wordnet.synsets(self._tokens[i],
                                          pos=wordnet.NOUN, lang='spa')
            elif self._analysis[i][1][0] == 'v':
                synsets = wordnet.synsets(self._tokens[i],
                                          pos=wordnet.VERB, lang='spa')
            else:
                synsets = None
            synonyms = []
            if synsets:
                for synset in synsets:
                    for synonym in synset.lemma_names('spa'):
                        if synonym != self._tokens[i] \
                                and synonym not in synonyms:
                            synonyms.append(synonym)
            self._synonyms.append([self._tokens[i], synonyms])
        return self._synonyms

    @timing
    def calculate_frequencies(self):
        """Returns a list of tuples where every word in tokens has its
        frequency of occurrence.

        :return: Frequencies
        """
        if self._tokens is None:
            raise Exception('It is necessary to execute tokenize first')
        self._fdist = FreqDist(self._tokens)
        return self._fdist.items()

    def return_to_text(self):
        """Returns a string with the tokens joined by spaces.

        :return: Text
        """
        return ' '.join(self._tokens)

    def show_results(self):
        """Shows the results of the study of the corpus."""
        print('***************** RESULTS *****************')
        print('1.- Original text:')
        print(self._text)
        print('*******************************************')
        if self._tokens:
            print('2.- Tokens:')
            print(self._tokens)
            print('*******************************************')
        if self._analysis:
            print('3.- Analysis:')
            print(self._analysis)
            print('*******************************************')
        if self._synonyms:
            print('4.- Synonyms:')
            print(self._synonyms)
            print('*******************************************')
        if self._fdist:
            print('5.- Frequencies:')
            print(self._fdist.items())
            print('*******************************************')
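# A minimal usage sketch, not part of the original snippet; the sample
# text is made up. The calls follow the order the docstrings describe:
# tokenize, then the filters, then the analysis and frequency steps.
corpus = SpanishCorpus('Los gatos comen pescado fresco', timing=True)
corpus.tokenize()
corpus.clean()
corpus.filter_stop_words()
corpus.analize(to_correct=False)
corpus.calculate_frequencies()
corpus.show_results()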
from codecs import open
from collections import defaultdict

from nltk.corpus import cess_esp

# "el" => {"DA": 3741, "NP": 243, "CS": 13, "RG": 7}
lexicon = defaultdict(lambda: defaultdict(int))
for w, t in cess_esp.tagged_words()[:100]:
    lexicon[w][t] += 1

top = []
for w, tags in lexicon.items():
    freq = sum(tags.values())      # 3741 + 243 + ...
    tag = max(tags, key=tags.get)  # DA
    top.append((freq, w, tag))

top = sorted(top, reverse=True)[:100]  # top 100
top = ["%s %s" % (w, tag) for freq, w, tag in top if w]
open("es-lexicon.txt", "w", encoding="utf-8").write("\n".join(top))

# Replace proper nouns with a placeholder. Fixed: the original iterated
# over tagged_words(), which yields (word, tag) pairs, but the inner loop
# expects sentences, i.e. tagged_sents(). Note that NLTK's cess_esp word
# tags are lowercase, so the prefix check uses "np".
ANONYMOUS = "anonymous"
for s in cess_esp.tagged_sents()[:100]:
    for i, (w, tag) in enumerate(s):
        if tag.startswith("np"):  # np = proper noun in the Parole tagset.
            s[i] = (ANONYMOUS, "NP")
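# A hedged sketch, not part of the original snippet: read es-lexicon.txt
# back into a word -> most-frequent-tag dict, usable for example as the
# model of an nltk.UnigramTagger.
lookup = {}
with open("es-lexicon.txt", "r", encoding="utf-8") as f:
    for line in f:
        word, tag = line.strip().rsplit(" ", 1)
        lookup[word] = tag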
import nltk
from nltk.corpus import cess_esp

nltk.download('conll2002')
nltk.download('cess_esp')

tagged_sentences = nltk.corpus.conll2002.tagged_sents()
tagged_sentences1 = cess_esp.tagged_sents()
print(tagged_sentences1[0])
print("conll2002 tagged sentences:", len(tagged_sentences))
print("conll2002 tagged words:", len(nltk.corpus.conll2002.tagged_words()))
print("cess_esp tagged sentences:", len(tagged_sentences1))
print("cess_esp tagged words:", len(cess_esp.tagged_words()))

from itertools import chain
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


def features(sentence, index):
    """sentence: [w1, w2, ...], index: the index of the word"""
    return {
        'word': sentence[index],
        'is_first': index == 0,
        # (the rest of the feature dict is truncated in the original snippet)
    }
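# A hedged training sketch, not part of the original snippet: wire the
# features() function into sklearn_crfsuite. Each sentence becomes a list
# of per-token feature dicts; the labels are the POS tags. The helper
# names and the training-set size of 2000 sentences are assumptions.
def sent2features(tagged_sent):
    words = [w for w, _ in tagged_sent]
    return [features(words, i) for i in range(len(words))]

def sent2labels(tagged_sent):
    return [t for _, t in tagged_sent]

train_sents = tagged_sentences1[:2000]
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=50)
crf.fit(X_train, y_train)
print(crf.predict(X_train[:1]))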
"""Convenience function to turn a tagged text into a readable string.""" return ' '.join('{} ({})'.format(word, tag) for word, tag in tagged_text) def normalize_tag(tag): """Normalize a single tag from the cess_esp tagset. This just chops off the semantic annotation. """ newTag = tag[0] #this removes everything except the basic POS if newTag == "F": newTag = "" #removes punctuation return newTag def percentage_correct(my_tags, correct_tags): return 100 * ( 1 - (Tagger.compare_texts(my_tags, correct_tags) / len(correct_tags))) # load the taggers from file try: brown_tagger = Tagger.load('brown.tag') except IOError: brown_tagger = Tagger(brown.tagged_words()) brown_tagger.save('brown.tag') try: cess_tagger = Tagger.load('cess.tag') except IOError: cess_tagger = Tagger(cess_esp.tagged_words()) cess_tagger.save('cess.tag')