def __init__(self, mwe=True):
    self.mwe = mwe
    # Train tagger if it's used for the first time.
    try:
        loadtagger('cess_unigram.tagger').tag(['estoy'])
        loadtagger('cess_bigram.tagger').tag(['estoy'])
    except IOError:
        print "*** First-time use of cess tagger ***"
        print "Training tagger ..."
        from nltk.corpus import cess_esp as cess
        cess_sents = cess.tagged_sents()
        traintag('cess', cess_sents)
        # Trains the tagger with no MWE.
        cess_nomwe = unchunk(cess.tagged_sents())
        tagged_cess_nomwe = batch_pos_tag(cess_nomwe)
        traintag('cess_nomwe', tagged_cess_nomwe)
        print
    # Load tagger.
    if self.mwe:
        self.uni = loadtagger('cess_unigram.tagger')
        self.bi = loadtagger('cess_bigram.tagger')
    else:
        self.uni = loadtagger('cess_nomwe_unigram.tagger')
        self.bi = loadtagger('cess_nomwe_bigram.tagger')

def __init__(self, tsents=cess_esp.tagged_sents()):
    """
    :param tsents: list of annotated sentences
    """
    self.__corpus = tsents
    self.__is_trained = False
    self.__tagger = None

def annotate_pos(sentence, annotations):
    """Produce part-of-speech annotations for a source-language sentence."""
    global TAGGER
    if TAGGER is None:
        logging.debug('Training part-of-speech tagger...')
        if os.path.isfile(TAGGER_FILENAME):
            # Pickle files must be opened in binary mode.
            with open(TAGGER_FILENAME, 'rb') as f:
                TAGGER = pickle.load(f)
        else:
            training_data = cess_esp.tagged_sents()
            unigram_tagger = UnigramTagger(training_data)
            bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
            TAGGER = BetterTagger(bigram_tagger)
            with open(TAGGER_FILENAME, 'wb') as f:
                pickle.dump(TAGGER, f)
    parts_of_speech = TAGGER.tag([t.lower() for t in sentence])
    annotations['pos'] = parts_of_speech
    return sentence, annotations

def train_and_save_spanish_tagger():
    cess_tagged_sents = cess_esp.tagged_sents()
    tagger = nltk.UnigramTagger(cess_tagged_sents)
    fname = 'UnigramTagger_cess_esp.pkl'
    output = open(fname, 'wb')
    dump(tagger, output, -1)
    output.close()

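# A minimal reload/usage sketch for the tagger pickled above, assuming only
# the filename used in train_and_save_spanish_tagger(); the helper name and
# the sample token list are illustrative, not from the original code.
import pickle


def load_spanish_tagger(fname='UnigramTagger_cess_esp.pkl'):
    with open(fname, 'rb') as f:
        return pickle.load(f)


# UnigramTagger.tag() expects a list of tokens; words never seen in
# CESS-ESP come back tagged None because there is no backoff tagger here.
# e.g. load_spanish_tagger().tag(['el', 'gato', 'come', 'pescado'])
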
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        eng_tagger = load(_POS_TAGGER)
        return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        training = cess_esp.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
        return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        training = cess_cat.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
        return cat_tagger

def ner(sent):
    cess_sents = cess.tagged_sents()
    # Train the unigram tagger
    # uni_tag = ut(cess_sents)
    # Tagger reads a list of tokens.
    # uni_tag.tag(sent.split(" "))
    # Split corpus into training and testing set.
    # train = int(len(cess_sents) * 90 / 100)  # 90%
    # Train a bigram tagger with only training data.
    # bi_tag = bt(cess_sents[:train])
    # Evaluate on the remaining 10% of testing data.
    # bi_tag.evaluate(cess_sents[train+1:])
    # Using the tagger.
    # bi_tag.tag(sent.split(" "))
    res = []
    common_word = pickle.load(open('spanish_words.pkl', 'rb'))
    for word in sent.split():
        if word in common_word:
            res.append([word, common_word[word]])
    return str(res)

def run(train, test, language, answer):
    results = {}
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())
    for lexelt in train:
        train_features, y_train = extract_features(train[lexelt], language, tagger)
        test_features, _ = extract_features(test[lexelt], language, tagger)
        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)
    """ B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])
        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c """
    A.print_results(results, answer)

def __init__(self):
    print "Training POS tagger..."
    # Copy the corpus reader's sentences into a plain list.
    training = list(cess_esp.tagged_sents())
    self.tagger = nltk.tag.hmm.HiddenMarkovModelTagger.train(training)
    print "--Training complete--"

def __get_train_corpus(self):
    # TODO: find a more complete data corpus
    sentences = cess.tagged_sents()
    if self.v >= 2:
        print("Number of sentences:", len(sentences))
    if self.v >= 3:
        print("Sentences:", sentences)
    # 80/20 train/test split.
    tam = round(len(sentences) * .8)
    self.train_data = sentences[:tam]
    self.test_data = sentences[tam:]

def Spanish_tagger():
    import nltk
    from nltk.corpus import cess_esp
    training = cess_esp.tagged_sents()
    default_tagger = nltk.DefaultTagger("NOUN")
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    return trigram_tagger

def __init__(self, train_percent_size=1):
    """
    :param train_percent_size: fraction of the corpus to train on, in (0, 1]
    """
    spanish_sents = spa_corpus.tagged_sents()
    subset = subset_from_corpus(spanish_sents, train_percent_size)
    self._tagger = trained_tagger_with_corpus(subset)

def train_and_save_unigram_tagger(fname):
    # Train nltk.UnigramTagger on tagged sentences from cess_esp.
    cess_tagged_sents = cess_esp.tagged_sents()
    tagger = nltk.UnigramTagger(cess_tagged_sents)
    # Save the trained tagger to a file.
    output = open(fname, 'wb')
    dump(tagger, output, -1)
    output.close()

def tonkenier(text):
    cess_sents = cess.tagged_sents()
    uni_tag = ut(cess_sents)
    words = text.replace(",", "").replace(".", "").replace("\n", "").replace("\t", "").split(" ")
    annotated_text = uni_tag.tag(words)
    return annotated_text

def __init__(self, use_mwe=False):
    self.use_mwe = use_mwe
    # Train tagger if it's used for the first time.
    try:
        load_tagger('cess_unigram.tagger').tag(['estoy'])
        load_tagger('cess_bigram.tagger').tag(['estoy'])
    except IOError:
        print("*** First-time use of cess tagger ***", file=sys.stderr)
        print("Training tagger ...", file=sys.stderr)
        # Load CESS corpus.
        cess_sents = cess.tagged_sents()
        train_tagger('cess', cess_sents)
        # Trains the tagger with no MWE.
        cess_nomwe = unchunk(cess.tagged_sents())
        tagged_cess_nomwe = pos_tag_sents(cess_nomwe, False)
        train_tagger('cess_nomwe', tagged_cess_nomwe)
    # Load tagger: select the MWE-aware models when use_mwe is set,
    # otherwise the "nomwe" models.
    _mwe_option_name = "_" if self.use_mwe else "_nomwe_"
    self.uni = load_tagger('cess{}unigram.tagger'.format(_mwe_option_name))
    self.bi = load_tagger('cess{}bigram.tagger'.format(_mwe_option_name))

def handling_negation(self, row):
    # Tokenize the row
    words = word_tokenize(row)
    # Read the corpus into a list;
    # each entry in the list is one sentence.
    tagged_cess_sents = cess_esp.tagged_sents()
    # Train the unigram tagger
    uni_tag = ut(tagged_cess_sents)
    # Tagger reads a list of tokens.
    # uni_tag.tag(words)
    print("WORDS: ", words)
    print("TAGS: ", uni_tag.tag(words))
    '''
    # Split corpus into training and testing set.
    train = int(len(tagged_cess_sents) * 90 / 100)
    # Train a bigram tagger with only training data.
    bi_tag = bt(tagged_cess_sents[:train], backoff=uni_tag)
    # Evaluate on the remaining 10% of testing data.
    bi_tag.evaluate(tagged_cess_sents[train+1:])
    # Using the tagger.
    # bi_tag.tag(row)
    print("TAGS: ", bi_tag.tag(row))
    '''
    speach_tags = ['JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP']
    # We obtain the type of each word in the text using the pos_tag function.
    tags = nltk.pos_tag(words)
    print("WORDS: ", words)
    print("TAGS: ", tags)
    # Now check whether there is a negation among the words.
    tags_2 = ''
    if "no" in words:
        tags_2 = tags[words.index("no"):]
        words_2 = words[words.index("no"):]
        words = words[:words.index("no")]
        print("tags_2 ", tags_2)
        print("words_2 ", words_2)
        print("words ", words)
        for index, word_tag in enumerate(tags_2):
            print("index ", index)
            if word_tag[1] in speach_tags:
                print("REPLACE", word_tag[0])
                words = words + [replace_antonyms(word_tag[0])] + words_2[index + 2:]
                # break
    # print("WORDS: ", words)
    print("FINAL TAGS2: ", tags_2)
    print("FINAL WORDS: ", ' '.join(words))
    return ' '.join(words)

def get_tagger():
    global TAGGER
    if TAGGER is None:
        # TODO: Load tagger from pickled form
        training_data = cess_esp.tagged_sents()
        unigram_tagger = UnigramTagger(training_data)
        bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
        TAGGER = bigram_tagger
    return TAGGER

def generateTagger():
    default_tagger = nltk.DefaultTagger('V')
    patterns = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP'),  # noun feminine plural
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)
    return combined_tagger

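# Illustrative usage of the combined tagger above, assuming the module-level
# nltk / cess_esp imports of the surrounding file; the token list is made up.
if __name__ == '__main__':
    tagger = generateTagger()
    # Tokens missing from CESS-ESP fall through to the regex patterns
    # (e.g. '.*as$' -> 'NFP') and finally to the default tag 'V'.
    print(tagger.tag(['las', 'casas', 'blancas']))
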
def main():
    # Read the corpus into a list;
    # each entry in the list is one sentence.
    cess_sents = cess_esp.tagged_sents()
    # Train the unigram tagger
    uni_tag = ut(cess_sents)
    output = open('uni_tag.pkl', 'wb')
    dump(uni_tag, output, -1)
    output.close()

def __init__(self):
    if os.path.exists('tagger_spanish.pickle'):
        # Pickle files must be opened in binary mode.
        with open('tagger_spanish.pickle', 'rb') as file_obj:
            self.tagger = pickle.load(file_obj)
    else:
        print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
        from nltk import UnigramTagger, BigramTagger, TrigramTagger
        from nltk.corpus import cess_esp
        sents = cess_esp.tagged_sents()
        unigram_tagger = UnigramTagger(sents)
        # Falls back to the unigram tagger when it can't tag a word.
        bigram_tagger = BigramTagger(sents, backoff=unigram_tagger)
        self.tagger = bigram_tagger
        # Dump the trained tagger.
        with open('tagger_spanish.pickle', 'wb') as file_obj:
            pickle.dump(self.tagger, file_obj)

def make_and_save_combined_tagger(fname):
    default_tagger = nltk.DefaultTagger('v')
    patterns = [
        (r'.*o$', 'n'),   # noun masculine singular
        (r'.*os$', 'n'),  # noun masculine plural
        (r'.*a$', 'n'),   # noun feminine singular
        (r'.*as$', 'n'),  # noun feminine plural
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)
    output = open(fname, 'wb')
    dump(combined_tagger, output, -1)
    output.close()

def generate_tagger(route):
    import nltk
    from nltk.corpus import cess_esp
    from pickle import dump

    patterns = [
        (r".*o$", "NMS"),   # noun masculine singular
        (r".*os$", "NMP"),  # noun masculine plural
        (r".*a$", "NFS"),   # noun feminine singular
        (r".*as$", "NFP"),  # noun feminine plural
    ]
    cesp_tsents = cess_esp.tagged_sents()
    td = nltk.DefaultTagger("s")
    tr = nltk.RegexpTagger(patterns, backoff=td)
    tu = nltk.UnigramTagger(cesp_tsents, backoff=tr)
    output = open(route + 'tagger.pkl', 'wb')
    dump(tu, output, -1)
    output.close()

def make_and_save_combined_tagger(fname):
    default_tagger = nltk.DefaultTagger('V')
    patterns = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP'),  # noun feminine plural
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    # Train nltk.UnigramTagger on tagged sentences from cess_esp.
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)
    # Save the trained tagger to a file.
    output = open(fname, 'wb')
    dump(combined_tagger, output, -1)
    output.close()

def printSpanishTags(self):
    sents = cess_esp.tagged_sents()
    tagger = HiddenMarkovModelTagger.train(sents)
    fullCorpus = self.fullCorpus()
    for line in fullCorpus:
        spanishSentence = line[0]
        spanishTokens = re.compile(r'\W+', re.UNICODE).split(unicode(spanishSentence, 'utf-8'))
        tags = tagger.tag(spanishTokens)
        for idx, token in enumerate(spanishTokens):
            if len(token) > 0:
                tag = tags[idx][1]
                sys.stdout.write(token.encode('utf-8'))
                sys.stdout.write(":")
                sys.stdout.write(tag)
                sys.stdout.write("\n")

def set_tagger(language):
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Catalan':
        training = cess_cat.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    elif language == 'Spanish':
        training = cess_esp.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    return tagger

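# None of these snippets measure tagging accuracy. A self-contained sketch of
# a held-out evaluation for the Spanish backoff chain built in set_tagger();
# the function name and the 90/10 split are assumptions for illustration.
import nltk
from nltk.corpus import cess_esp


def evaluate_spanish_chain():
    sents = cess_esp.tagged_sents()
    cut = int(len(sents) * 0.9)
    train, test = sents[:cut], sents[cut:]
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train, backoff=t0)
    t2 = nltk.BigramTagger(train, backoff=t1)
    t3 = nltk.TrigramTagger(train, backoff=t2)
    # TaggerI.evaluate() returns tagging accuracy in [0, 1].
    return t3.evaluate(test)
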
def generateTagger(clean_vocabulary):
    fname = 'combined_taggerP.pkl'
    default_tagger = nltk.DefaultTagger('V')
    patterns = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP'),  # noun feminine plural
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)
    s_tagged = combined_tagger.tag(clean_vocabulary)
    output = open(fname, 'wb')
    dump(s_tagged, output, -1)
    output.close()
    return s_tagged

def init_tagger_chunkers():
    # Read corpus
    print("Reading cess_esp ...")
    sents = cess_esp.tagged_sents()
    corpus = utils.read_corpus(CORPUS_PATH)
    train, test = utils.split_to_train_test_set(corpus)
    # Train tagger
    print("Training tagger ...")
    tagger = Tagger.Tagger(TAGGER, sents)
    # Define the quantity and food grammars
    print("Initializing regex chunker ...")
    grammar = {}
    grammar[INGREDIENTE] = INGREDIENTE_GRAMMAR
    grammar[COMIDA] = COMIDA_GRAMMAR
    grammar[CANTIDAD] = CANTIDAD_GRAMMAR
    grammar[PEDIDO] = PEDIDO_GRAMMAR
    regex_chunker = RegexChunker.RegexChunker(grammar)
    # Write IOB tags
    utils.write_iob_tags(IOB_INPUT_FILE, corpus, tagger, regex_chunker)
    # Read IOB tags
    train_data, test_data = utils.read_iob_tag(IOB_INPUT_FILE)
    print("Initializing unigram chunker ...")
    unigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST)
    print("Initializing bigram chunker ...")
    bigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST, 2)
    print("Initializing trigram chunker ...")
    trigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST, 3)
    print("Initializing bayes chunker ...")
    bayes_chunker = ConsecutiveNPChunker.ConsecutiveNPChunker(train_data, TO_DETECT_LIST)
    chunkers = {
        REGEX: regex_chunker,
        UNIGRAM: unigram_chunker,
        BIGRAM: bigram_chunker,
        TRIGRAM: trigram_chunker,
        BAYES: bayes_chunker,
    }
    return tagger, chunkers

def __init__(self): """ Initializes the tagger object """ # nltk.download("cess_esp") # Read the corpus into a list, each entry in the list is one sentence. self.cess_sents = cess.tagged_sents() self.tagger_type = NLTK_TAGGER_NAME # This part is done only once, and we don't have to run it again. # We keep the code here to understand how we created the model. # Train the unigram tagger # tagger = ut(self.cess_sents) # Store the trained tagger to a file # f = open('nltk_spanish_tagger.pickle', 'wb') # pickle.dump(tagger, f) # f.close() # Open the already trained model with open('nltk_spanish_tagger.pickle', 'rb') as f: # Load the tagger self.tagger = pickle.load(f)
def __init__(self):
    cess_sents = cess.tagged_sents()
    self.uni_tag = ut(cess_sents)
    self.model = NgramModel(3, brown.words())
    self.translation = []
    self.dictionary = collections.defaultdict(lambda: 0)
    dictionaryFile = open("../corpus/Dictionary.txt", 'r')
    for translation in dictionaryFile:
        spanish, english = translation.split(" - ")
        spanish = spanish.decode('utf-8')
        self.dictionary[spanish] = collections.defaultdict(lambda: [])
        english = english.rstrip(';\n').split('; ')
        for pos in english:
            pos = pos.split(': ')
            self.dictionary[spanish][pos[0]] = pos[1].split(', ')
    self.sentences = []
    sentencesFile = open("../corpus/TestSet.txt", 'r')
    for sentence in sentencesFile:
        self.sentences.append(sentence.rstrip('\n'))

    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  # white
    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'
    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'
    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

# back to our original example, one phrase is more dense than the other.

# <codecell>

assert density("en", nltk.pos_tag(tokenize("If I were you I wouldn’t do that with these."))) \
    < density("en", nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog.")))

# <markdowncell>

# Tag Spanish Text
# ================

# <codecell>

from nltk.corpus import cess_esp
sents = cess_esp.tagged_sents()

# <markdowncell>

# Split into training and test set

# <codecell>

training_dx = int(len(sents) * 90 / 100)
training = sents[:training_dx]
test = sents[training_dx + 1:]

# <markdowncell>

# train tagger and check accuracy (this takes 40 seconds or so) ...

import nltk
from nltk.corpus import cess_esp
from pickle import dump

patterns = [
    (r".*o$", "NMS"),
    (r".*os$", "NMP"),
    (r".*a$", "NFS"),
    (r".*as$", "NFP"),
]

cesp_tsents = cess_esp.tagged_sents()
td = nltk.DefaultTagger("s")
tr = nltk.RegexpTagger(patterns, backoff=tr if False else td)
tu = nltk.UnigramTagger(cesp_tsents, backoff=tr)

output = open("tagger.pkl", "wb")
dump(tu, output, -1)
output.close()

from nltk.tokenize import word_tokenize

# Spanish adaptation
from nltk.corpus import cess_esp

nltk.tag.mapping._load_universal_map("es-cast3lb")
mapdict = nltk.tag.mapping._MAPPINGS["es-cast3lb"]["universal"]
alltags = set(t for w, t in cess_esp.tagged_words())
for tag in alltags:
    if len(tag) <= 2:  # These are complete
        continue
    # Map longer CESS tags by their two-character prefix.
    mapdict[tag] = mapdict[tag[:2]]
cess_esp._tagset = "es-cast3lb"

from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

cess_sents = cess_esp.tagged_sents(tagset='universal')
uni_tag = ut(cess_sents, backoff=nltk.DefaultTagger('X'))


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):

import A
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from nltk import word_tokenize
from nltk.corpus import cess_esp
from nltk.corpus import cess_cat
from nltk.data import load
import nltk
from nltk import UnigramTagger as ut

tagger_cat = ut(cess_cat.tagged_sents())
tagger_esp = ut(cess_esp.tagged_sents())

# You might change the window size
window_size = 15


def b1_base(data):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        [(instance_id, left_context, head, right_context, sense_id), ...]
    :param s: list of words (features) for a given lexelt: [w1, w2, w3, ...]
    :return: vectors: a dictionary with the structure
                 {instance_id: [w_1 count, w_2 count, ...], ...}
             labels: a dictionary with the structure
                 {instance_id: sense_id}
    '''

    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  # white
    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'
    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'
    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

def get_tagged_sentences(self):
    return cess_esp.tagged_sents()

from nltk.corpus import cess_esp as cess
from nltk import RegexpTokenizer
import nltk
import pickle

# My sentences
sentence = "hola, hola, soy Pedro ¿como te llamas?."
tokenizer = RegexpTokenizer(r'\w+')
tokenized_words = tokenizer.tokenize(sentence)

# Declare the train/test split.
train = None
test = None
cess_sents = cess.tagged_sents()
try:
    with open('test_pickles/test_data.pickle', 'rb') as fa:
        div = pickle.load(fa)
    train = cess_sents[:div]
    test = cess_sents[div + 1:]
except FileNotFoundError:
    # Training data: use 90% of the corpus and remember the split point.
    print("dumping train/test")
    div = len(cess_sents) * 90 // 100
    train = cess_sents[:div]
    test = cess_sents[div + 1:]
    with open('test_pickles/test_data.pickle', 'wb') as fb:
        pickle.dump(div, fb)

#####
#

def init():
    cess_sents = cess.tagged_sents()
    unitag = ut(cess_sents)

"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG" # Percentage of text left of the scrollbar position _FRACTION_LEFT_TEXT = 0.30
#! /usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import cess_esp, brown
from nltk import UnigramTagger, BigramTagger
from cPickle import dump

# read corpus
corpusEs = cess_esp.tagged_sents()
corpusEn = brown.tagged_sents()

# Train the unigram taggers
uniTagEs = UnigramTagger(corpusEs)
uniTagEn = UnigramTagger(corpusEn)

# write out files
outputEs = open('uniTag.es.pkl', 'wb')
outputEn = open('uniTag.en.pkl', 'wb')
dump(uniTagEs, outputEs, -1)
dump(uniTagEn, outputEn, -1)
outputEs.close()
outputEn.close()

    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  # white
    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'
    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'
    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

# POS tagging (works in English, not in Spanish)
# tagged = nltk.pos_tag(tokens)
# print "Tagged:"
# for t in tagged:
#     print "\t", "\t".join(t)

# Simple POS tagging (in Spanish)
# To tag Spanish text we first need to train the tagger
# on a corpus of sentences that are already tagged.
from nltk.corpus import cess_esp as cess  # The corpus
from nltk import UnigramTagger as ut      # The single-word tagger
from nltk import BigramTagger as bt       # The two-word tagger

# Read the corpus into a list;
# each entry in the list is one sentence.
cess_sents = cess.tagged_sents()

# Split the corpus into two parts: training and testing.
train = int(len(cess_sents) * 90 / 100)  # 90% for training

import pickle

crear_taggers = True
if crear_taggers:
    # Train the unigram tagger (testing makes no sense for unigrams).
    uni_tag = ut(cess_sents)
    # Train the bigram tagger on the training data only.
    bi_tag = bt(cess_sents[:train])
    # Save the taggers to files to save time next time.
    with open('test/cess_unigram.tagger.pkl', 'wb') as output:
        pickle.dump(uni_tag, output, pickle.HIGHEST_PROTOCOL)

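# The assumed counterpart (not shown in the original): on later runs,
# reload the pickled tagger instead of retraining it.
if not crear_taggers:
    with open('test/cess_unigram.tagger.pkl', 'rb') as f:
        uni_tag = pickle.load(f)
    # A unigram tagger tags a token list; words unseen in CESS-ESP get None.
    print(uni_tag.tag("el gato come pescado".split()))
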
from nltk.tag.stanford import POSTagger  # For French and German
import json  # To be able to read json files
import urllib2  # To be able to read a url (speech) containing json
import collections  # The collections module has useful functions to count frequencies automatically
from collections import Counter  # Useful for frequencies in a list
import MySQLdb  # Module for the MySQL database
import datetime  # Module to use date and time
from config import DB_USER, DB_PWD, DB_NAME  # Database settings

languages = ["ar", "de", "en", "es", "fr", "it", "ko", "no", "pt", "sv", "zh"]  # languages supported by this parser
punctuation = [",", ";", ".", ":", "!", "?", "(", ")", "-", "%", "\"", "[", "]"]  # all punctuation marks
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')  # Loads the English model to detect sentences
cess_sents = cess.tagged_sents()  # Spanish sentences
uni_tag = ut(cess_sents)  # Tagger for Spanish sentences
st_tag_english = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')  # Entities for English
st_pos_french = POSTagger('french.tagger', 'stanford-postagger.jar')  # French grammar
st_pos_german = POSTagger('german-fast.tagger', 'stanford-postagger.jar')  # German grammar
st_tag_german = NERTagger('hgc_175m_600.crf.ser.gz', 'stanford-ner.jar')  # Entities for German
chunker = nltk.data.load('chunkers/maxent_ne_chunker/english_ace_multiclass.pickle')  # Loads the chunk parser
maxEnt = chunker._tagger.classifier()  # The tag classifier for entities

# Please note that at the moment English, French, German and Spanish are the only
# languages supported by this program. It is slower in German and better in English:
# in English it is possible to define a probability for entities (with the chunk parser).
# ---------------------------------------------------------------------------------------
# Global variables

        elif tag2.startswith('a'):
            number2, gender2 = self.number_from_adjective(tag2), self.gender_from_adjective(tag2)
        else:
            return True
        return number1 == number2 and (gender1 == "i" or gender2 == "i" or gender1 == gender2)

    def number_from_verb(self, tag):
        return tag[5]

    def gender_from_verb(self, tag):
        return tag[2]

    def number_from_pronom(self, tag):
        return tag[4]

    def gender_from_pronom(self, tag):
        return tag[3]

    def number_from_adjective(self, tag):
        return tag[4]

    def gender_from_adjective(self, tag):
        return tag[3]


if __name__ == "__main__":
    # Despite the name, this is the tagged corpus, not a trained tagger.
    tagger = cess_esp.tagged_sents()

from clean_tokens import *
from mutual_information import *
import nltk
from write import writeList
from nltk.corpus import cess_esp
from mutual_information import getSentences


def tag_spanish_sentence(sentence, tagger):
    tokens = nltk.word_tokenize(sentence)
    s_tagged = tagger.tag(tokens)
    return s_tagged


if __name__ == '__main__':
    # Get the text and tokenize it into sentences.
    """
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    sentences = getSentences(text_string)
    sent = sentences[12]
    # print(type(sent))
    text = nltk.word_tokenize(sent)
    cad = nltk.Text(text)
    print(nltk.pos_tag(cad))
    """
    tagget_sents = cess_esp.tagged_sents()
    # print(tagget_sents)
    tagger = nltk.UnigramTagger(tagget_sents)
    # tagger.load(input)

"Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="universal"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="universal"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG" # Percentage of text left of the scrollbar position _FRACTION_LEFT_TEXT = 0.30
def __init__(self):
    cess_sents = cess.tagged_sents()
    self.uni_tag = ut(cess_sents)
    train = int(len(cess_sents) * 90 / 100)  # 90%
    self.bi_tag = bt(cess_sents[:train])
    self.bi_tag.evaluate(cess_sents[train + 1:])

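# TaggerI.evaluate() returns an accuracy score that the constructor above
# discards. A hedged variant (the attribute name is an assumption) that
# keeps the held-out accuracy for inspection:
def __init__(self):
    cess_sents = cess.tagged_sents()
    self.uni_tag = ut(cess_sents)
    train = int(len(cess_sents) * 90 / 100)  # 90% training split
    self.bi_tag = bt(cess_sents[:train])
    # Accuracy of the bigram tagger on the remaining 10%.
    self.bi_accuracy = self.bi_tag.evaluate(cess_sents[train + 1:])
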