def __init__(self, tsents=None):
    """Initialize the tagger wrapper with an annotated corpus.

    :param tsents: list of annotated sentences; defaults to the Floresta
        tagged sentences when omitted.
    """
    # BUG FIX: the old default `tsents=floresta.tagged_sents()` was evaluated
    # once at import time, hitting the corpus reader even when a caller always
    # supplies its own sentences. Resolve it lazily instead.
    self.__corpus = floresta.tagged_sents() if tsents is None else tsents
    self.__is_trained = False   # flips to True once training has run
    self.__tagger = None        # trained tagger instance, set by training
Esempio n. 2
0
def accuracy_measure():
    """Train and evaluate POS taggers on the Floresta and Mac-Morpho corpora.

    Builds 90/10 train/test splits for both corpora, runs the no-backoff
    taggers, then runs the backoff taggers (persisting them only when no
    saved taggers exist yet).
    """
    basicConfig(format='%(levelname)s %(message)s', level=INFO)
    info('reading tagged sentences')
    info('simplifying tags')

    # Helper: split a sentence list into (train, test) at the 90% mark.
    def ninety_ten(sents):
        cut = int(len(sents) * 0.9)
        return sents[:cut], sents[cut:]

    flo_train, flo_test = ninety_ten(
        simplified_sents_floresta(floresta.tagged_sents()))
    mac_train, mac_test = ninety_ten(mac_morpho.tagged_sents())

    no_backoff_taggers(flo_test, flo_train)
    no_backoff_taggers(mac_test, mac_train, corpus='macmorpho')

    # Persist the backoff taggers only when none are saved yet.
    save = not pt.check_for_taggers()

    backoff_taggers(flo_test, flo_train, save)
    backoff_taggers(mac_test, mac_train, save, corpus='macmorpho')
def add_start_end(start, end):
    """Flatten Floresta sentences [start:end) into (tag, word) pairs.

    Each sentence is wrapped between ("START", "START") and ("END", "END")
    boundary markers; tags are passed through simplify_tag.

    :return: flat list of (tag, word) tuples.
    """
    flattened = []
    for sentence in floresta.tagged_sents()[start:end]:
        body = [(simplify_tag(tag), word) for (word, tag) in sentence]
        flattened += [("START", "START")] + body + [("END", "END")]
    return flattened
Esempio n. 4
0
def tokenize():
    """Build a unigram tagger over the simplified Floresta corpus.

    Words are lower-cased and tags run through simplify_tag; the unigram
    tagger backs off to a DefaultTagger that assigns 'n' (noun).

    :return: trained nltk.UnigramTagger.
    """
    corpus = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
              for sent in floresta.tagged_sents() if sent]
    # FIX: the old `test = tsents[:100]` slice was computed but never used;
    # train on the same sentences the original trained on (all but first 100).
    train = corpus[100:]
    tagger0 = nltk.DefaultTagger('n')
    tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
    return tagger1
Esempio n. 5
0
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]

    elif corpus.lower() == "cintil":
        print "Loading CINTIL"
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]

    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)

    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    maxent_tagger.evaluate(test_sents)

    """
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante","rápido", "!"])
    print "\n\n"
    print "show the 40 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(40)
    """

    fModel = open('test.pkl',"wb")
    pickle.dump(maxent_tagger, fModel,1)
    fModel.close()
def floresta_tagger():
    """Prepare simplified, lower-cased Floresta train/test splits.

    NOTE(review): the splits are built but never returned or used and the
    function returns None — the snippet looks truncated; confirm against
    the original source.
    """
    from nltk.corpus import floresta
    corpus = [[(word.lower(), simplify_tag(tag)) for (word, tag) in sentence]
              for sentence in floresta.tagged_sents() if sentence]
    train = corpus[100:]
    test = corpus[:100]
Esempio n. 7
0
def retrieve_traindata():
    """Return Floresta sentences as lower-cased (word, tag) lists.

    Tags go through two passes: first simplify_tag, then
    convert_to_universal_tag applied to the already-simplified tags.
    """
    simplified = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
                  for sent in floresta.tagged_sents() if sent]
    # Second pass maps the simplified tags onto the universal tagset
    # (lower() on the already-lowered words is an idempotent no-op).
    return [[(w.lower(), convert_to_universal_tag(t)) for (w, t) in sent]
            for sent in simplified if sent]
Esempio n. 8
0
 def tagger_training_corpus():
     """Return the Floresta tagged sentences with punctuation tokens removed.

     Sentences that contained only punctuation are kept as empty lists,
     matching the original behaviour.
     """
     return [
         [token for token in sentence
          if token[0].lower() not in string.punctuation]
         for sentence in floresta.tagged_sents()
     ]
    def run(self, corpus=Corpus.FLORESTA, force=False):
        """Train a tagger for the selected corpus.

        :param corpus: which corpus to train on (Corpus.FLORESTA or
            Corpus.MAC_MORPHO); anything else is silently ignored.
        :param force: stored on the instance as ``should_force`` —
            presumably forces retraining; confirm against self.train.
        """
        self.should_force = force

        if corpus == Corpus.FLORESTA:
            print("\n##### Floresta Corpus #####")
            self.train("floresta", floresta.tagged_sents())
        elif corpus == Corpus.MAC_MORPHO:
            print("\n###### Mac Morpho Corpus #####")
            self.train("mac_morpho", mac_morpho.tagged_sents())
Esempio n. 10
0
def train_tagger():
    """Train a bigram POS tagger on the simplified Floresta corpus.

    Backoff chain: BigramTagger -> UnigramTagger -> DefaultTagger('n').

    :return: trained nltk.BigramTagger.
    """
    print("Training taggers, please wait...")

    # Lower-case words and simplify the Floresta tag strings.
    sentences = [[(word.lower(), simplify_tag(tag)) for (word, tag) in sent]
                 for sent in floresta.tagged_sents() if sent]

    default = nltk.DefaultTagger("n")
    unigram = nltk.UnigramTagger(sentences, backoff=default)
    return nltk.BigramTagger(sentences, backoff=unigram)
Esempio n. 11
0
def train_pos_tagger():
    """Train a TnT POS tagger on the first 7000 simplified Floresta sentences.

    :return: trained nltk.tag.tnt.TnT tagger.
    """
    # Strip the extra annotation parts from the tags.
    sentences = [[(word, simplify_tag(tag)) for (word, tag) in sent]
                 for sent in floresta.tagged_sents() if sent]

    train_data = sentences[:7000]   # first 7000 Floresta sentences
    test_data = sentences[7000:]    # remaining ~2266, kept for optional evaluation

    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)

    # Uncomment to measure the trained tagger's accuracy:
    # print('tnt_pos_tagger accuracy: ', tnt_pos_tagger.evaluate(test_data))

    return tnt_pos_tagger
Esempio n. 12
0

def normalize(s, punctuation="!?.:;,()[] "):
    """Decode (if needed) and trim a raw sentence.

    :param s: sentence as ISO-8859-1 encoded bytes, or an already-decoded
        str (generalized: the original crashed on str input under Python 3,
        where str has no ``decode``).
    :param punctuation: characters stripped from both ends after whitespace.
    :return: cleaned text as str.
    """
    if isinstance(s, bytes):
        s = s.decode("iso-8859-1")
    s = s.strip()
    return s.strip(punctuation)


# Tag the tokens of the file "sentence-pt" with unigram and bigram taggers
# trained on the Floresta corpus (Python 2 script: bare `print` statements).
with open("sentence-pt") as content_file:
    # Read the raw sentence and clean it (decode + strip punctuation).
    content = content_file.read()
    content = normalize(content)

    tokens = word_tokenize(content)

    # Floresta tagged sentences serve as training material for both taggers.
    sents = floresta.tagged_sents()

    # Unigram tagger trained on the FULL corpus (no held-out data here).
    uni_tag = ut(sents)
    print uni_tag.tag(tokens)

    # Split corpus into training and testing set.
    train = int(len(sents) * 90 / 100)  # 90%

    # Train a bigram tagger with only training data.
    bi_tag = bt(sents[:train])

    # Evaluates on testing data remaining 10%
    # NOTE(review): `train + 1` skips one sentence between the train and
    # test slices — confirm this off-by-one is intended.
    bi_tag.evaluate(sents[train + 1 :])

    # Using the tagger.
    print bi_tag.tag(tokens)
Esempio n. 13
0
from nltk.corpus import floresta
import nltk

# Collect every tag in the Floresta corpus and print the 50 most common.
tags = [tag for sent in floresta.tagged_sents() for (palavra, tag) in sent]
freq = nltk.FreqDist(tags)
print(freq.most_common(50))
Esempio n. 14
0
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(tagset='simple'),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='simple'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='simple'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='simple'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='simple'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='simple'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
Esempio n. 15
0
def get_training():
    """Split the Floresta tagged sentences into training and target sets.

    :return: (training, target) where target is the first 10 sentences and
        training is everything else.
    """
    # FIX: fetch the corpus view once instead of calling
    # floresta.tagged_sents() twice for the two slices.
    sents = floresta.tagged_sents()
    return sents[10:], sents[:10]
Esempio n. 16
0
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(simplify_tags=True),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(simplify_tags=True),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(simplify_tags=True),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(simplify_tags=True),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(simplify_tags=True),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(simplify_tags=True),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
Esempio n. 17
0
def get_dataset():
    """Download (if needed) the Floresta corpus and return it as
    lower-cased, simplified tagged sentences.

    :return: list of sentences, each a list of (word, tag) tuples.
    """
    nltk.download("floresta")
    return [[(word.lower(), simplify_tag(tag)) for (word, tag) in sentence]
            for sentence in floresta.tagged_sents() if sentence]
def main() -> None:
    """
    Program entry point. Main execution flow of the POS tagger:
        - load corpus data
        - split data into train/testing sets
        - handle unknown words in both sets
        - train tagger (calculate HMM's word emission and tag transition probabilities)
        - test tagger (Viterbi algorithm, backtracing and accuracy measure)
    :return: None.
    :raises ValueError: if config.corpus names an unsupported corpus.
    """
    parse_command_line_arguments()

    if config.corpus == "brown":  # Default corpus.
        # Only need to do this once.
        # download_brown_corpus()
        # Retrieve tagged sentences from the Brown corpus
        tagged_sentences = brown.tagged_sents(tagset='universal')
        print("Corpus used: Brown Corpus (universal tagset)")
        if config.debug:
            print_corpus_information(brown, "Brown Corpus")
    elif config.corpus == "floresta":
        # Only need to do this once.
        # download_floresta_corpus()
        tagged_sentences = floresta.tagged_sents()
        print("Corpus used: Floresta Treebank")
        if config.debug:
            # BUG FIX: previously passed `brown` here instead of `floresta`.
            print_corpus_information(floresta, "Floresta Treebank")
    else:
        # BUG FIX: previously an unknown corpus fell through with
        # `tagged_sentences` unbound, crashing later with a NameError.
        raise ValueError(
            "Unsupported corpus: %r (expected 'brown' or 'floresta')"
            % config.corpus)

    # Start measuring runtime.
    start_time = time.time()

    # Split data into a training and a testing set (default split 95/5 sentences).
    training_set, testing_set = split_train_test_data(tagged_sentences)
    if config.debug:
        print_number_of_sentences(training_set, "training dataset")
        print_number_of_sentences(testing_set, "testing dataset")

    # Replace infrequent words with special 'UNK' tags.
    training_words = extract_words(training_set)
    unique_training_words = remove_list_duplicates(training_words)
    training_set = handle_unknown_words(training_set,
                                        unique_training_words,
                                        is_training_set=True)
    testing_set = handle_unknown_words(testing_set,
                                       unique_training_words,
                                       is_training_set=False)

    # Store all words and all tags from the training dataset in ordered lists
    # (and make lists without duplicates).
    training_tags = extract_tags(training_set)
    unique_training_tags = remove_list_duplicates(training_tags)

    # Train the POS tagger by generating the tag transition and word emission
    # probability matrices of the HMM.
    tag_transition_probabilities, emission_probabilities = train_tagger(
        training_set, training_tags)

    # Test the POS tagger on the testing data using the Viterbi back-tracing
    # algorithm.
    test_tagger(testing_set, unique_training_tags,
                tag_transition_probabilities, emission_probabilities)

    print_runtime(round(time.time() - start_time,
                        2))  # Record and print runtime.
Esempio n. 19
0
    'English: Wall Street Journal Corpus (simplified)':
    lambda: treebank.tagged_sents(tagset='simple'),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(tagset='simple'),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='simple'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='simple'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='simple'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='simple'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
Esempio n. 20
0
    'English: Wall Street Journal Corpus (simplified)':
    lambda: treebank.tagged_sents(simplify_tags=True),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(simplify_tags=True),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(simplify_tags=True),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
Esempio n. 21
0
# # e http://www.nltk.org/howto/portuguese_en.html)
import nltk
from nltk.corpus import floresta
from tqdm import tqdm  # importamos o tqdm para gerar barras de progresso facilmente.


# Função auxiliar que realiza a extração da classe sintática de uma palavra da tag (ou anotação) para aquela
# palavra no córpus, removendo as partes da tag que não são relevantes neste contexto:
def extract_pos(tag):
    """Extract the syntactic-class portion of a Floresta tag.

    Composite tags such as 'H+n' carry the POS after the '+'; for those
    the second '+'-separated piece is returned. Plain tags are returned
    unchanged.
    """
    pieces = tag.split('+')
    return pieces[1] if len(pieces) > 1 else pieces[0]


sentences = floresta.tagged_sents()  # original tagged sentences from the corpus

# Fill tagged_sentences with corpus data whose tags have been reduced to
# their grammatical-class part via extract_pos (words lower-cased).
tagged_sentences = []
print('Extraindo classes gramaticais das tags do córpus:')
for sent in tqdm(sentences):
    tagged_sentences.append(
        [(word.lower(), extract_pos(tag)) for (word, tag) in sent])

# Split the corpus into a training set and a test set (first 100 sentences).
train = tagged_sentences[100:]
test = tagged_sentences[:100]
Esempio n. 22
0
from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter

from HMM import *

# Load the Training and Test Sentences
# (mixed Python 2/3 print styles preserved from the original snippet;
# the module-level names below are presumably consumed by the HMM code
# further down the file — do not rename.)
print("Downloading Training Sentences from Corpus")
# First 10k tagged sentences of each corpus serve as training data.
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
print "Done!"

print("Downloading Test Sentences from Corpus")
# Sentences 10000-10499 of each corpus are held out for testing.
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print "Done!"


# Extracts words and tags from Sentences
def extractWords_and_Tags(sentences):
    words = {}
    tags = {}
    for sentence in sentences:
Esempio n. 23
0
    'English: Wall Street Journal Corpus (simplified)':
    lambda: treebank.tagged_sents(tagset='universal'),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(tagset='universal'),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='universal'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
Esempio n. 24
0
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
Esempio n. 25
0
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
        tagset="universal"
    ),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(tagset='universal'),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='universal'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='universal'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='universal'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='universal'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='universal'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
Esempio n. 27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import floresta
import cPickle

FILENAME = "txts/floresta_trigram.pos"

def simplify_tag(t):
    """Strip the leading part from composite Floresta tags.

    For tags like 'H+prop' everything after the first '+' is returned;
    tags without a '+' come back unchanged.
    """
    plus = t.find('+')
    return t[plus + 1:] if plus != -1 else t

# Build a trigram tagger over the simplified Floresta corpus and pickle it.
tsents = [[(word.lower(), simplify_tag(tag)) for (word, tag) in sentence]
          for sentence in floresta.tagged_sents() if sentence]
train = tsents[100:]
test = tsents[:100]

# Backoff chain: default 'n' -> unigram -> bigram -> trigram.
tagger = nltk.DefaultTagger('n')
for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
    tagger = tagger_cls(train, backoff=tagger)

# Accuracy computed on the held-out sentences (result deliberately unused).
tagger.evaluate(test)

with open(FILENAME, 'wb') as outFile:
    cPickle.dump(tagger, outFile, -1)
Esempio n. 28
0
    "English: Wall Street Journal Corpus (simplified)":
    lambda: treebank.tagged_sents(tagset="universal"),
    "Chinese: Sinica Corpus":
    lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)":
    lambda: sinica_treebank.tagged_sents(tagset="universal"),
    "Dutch: Alpino Corpus":
    lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)":
    lambda: alpino.tagged_sents(tagset="universal"),
    "Hindi: Indian Languages Corpus":
    lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)":
    lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
    "Portuguese: Floresta Corpus (Portugal)":
    lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)":
    lambda: floresta.tagged_sents(tagset="universal"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)":
    lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
    lambda: mac_morpho.tagged_sents(tagset="universal"),
    "Spanish: CESS-ESP Corpus (simplified)":
    lambda: cess_esp.tagged_sents(tagset="universal"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
def __create_tagger():
    """Build a bigram tagger trained on Floresta, backed off to a
    Mac-Morpho unigram tagger.

    :return: trained nltk.BigramTagger.
    """
    unigram_backoff = UnigramTagger(nltk.corpus.mac_morpho.tagged_sents())
    return nltk.BigramTagger(floresta.tagged_sents(),
                             backoff=unigram_backoff)