Esempio n. 1
0
def get_tagger(lang):
    """Return the part-of-speech tagger for ``lang``, building it on first use.

    Each tagger lives in a module-level global (``eng_tagger``,
    ``spa_tagger``, ``cat_tagger``) and is reused for as long as that global
    is truthy.

    :param lang: ``"English"`` or ``"Spanish"``; every other value selects
        the Catalan tagger.
    :return: the cached or freshly built tagger for the language.
    """
    global eng_tagger, spa_tagger, cat_tagger

    def build_backoff_chain(corpus):
        # DefaultTagger('NN') <- Unigram <- Bigram <- Trigram, every stage
        # trained on the same tagged sentences of the given corpus reader.
        sentences = corpus.tagged_sents()
        chain = nltk.DefaultTagger('NN')
        for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                           nltk.TrigramTagger):
            chain = tagger_cls(sentences, backoff=chain)
        return chain

    if lang == "English":
        if not eng_tagger:
            eng_tagger = load(
                'taggers/maxent_treebank_pos_tagger/english.pickle')
        return eng_tagger

    if lang == "Spanish":
        if not spa_tagger:
            spa_tagger = build_backoff_chain(cess_esp)
        return spa_tagger

    # Anything that is neither English nor Spanish is handled as Catalan.
    if not cat_tagger:
        cat_tagger = build_backoff_chain(cess_cat)
    return cat_tagger
Esempio n. 2
0
def run(train, test, language, answer):
    """Classify every lexelt in ``train`` and report the predictions.

    For each lexelt, features are extracted from the training and test
    instances, vectorized, reduced by ``feature_selection`` and passed to
    ``classify``; the per-lexelt results are handed to ``A.print_results``.

    :param train: mapping of lexelt -> training instances
    :param test: mapping of lexelt -> test instances, indexed with the same
        lexelts as ``train``
    :param language: 'English', 'Spanish' or 'Catalan'; selects the POS
        tagger given to ``extract_features``
    :param answer: forwarded unchanged to ``A.print_results``
    """
    # Choose the POS tagger.  For any other language no tagger is bound, so
    # the first use inside the loop below fails.
    if language == 'English':
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())

    predictions = {}
    for lexelt in train:
        train_feats, labels = extract_features(train[lexelt], language, tagger)
        test_feats, _ = extract_features(test[lexelt], language, tagger)

        train_matrix, test_matrix = vectorize(train_feats, test_feats)
        train_selected, test_selected = feature_selection(
            train_matrix, test_matrix, labels)
        predictions[lexelt] = classify(train_selected, test_selected, labels)

    # Disabled B1.c variant, kept for reference:
    #     for lexelt in train:
    #         features = getBestWords(train[lexelt], 30)
    #         train_features = countFeature(features, train[lexelt])
    #         _, y_train = extract_features(train[lexelt], language)
    #         test_features = countFeature(features, test[lexelt])
    #
    #         X_train, X_test = vectorize(train_features, test_features)
    #         results[lexelt] = classify(X_train, X_test, y_train)

    A.print_results(predictions, answer)
def _train_backoff_tagger(tagged_sentences):
    """Train a default -> unigram -> bigram -> trigram backoff tagger chain.

    :param tagged_sentences: tagged sentences used to train every stage.
    :return: the trigram tagger at the head of the chain.
    """
    tagger = nltk.DefaultTagger('NN')
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                       nltk.TrigramTagger):
        tagger = tagger_cls(tagged_sentences, backoff=tagger)
    return tagger


def get_tagger(lang):
    """Return the part-of-speech tagger for ``lang``, building it on first use.

    Taggers are cached in the module-level globals ``eng_tagger``,
    ``spa_tagger`` and ``cat_tagger``; a cached tagger is reused while the
    corresponding global is truthy.  The function writes nothing to stdout.

    :param lang: ``"English"`` or ``"Spanish"``; every other value selects
        the Catalan tagger.
    :return: the cached or freshly built tagger for the language.
    """
    global eng_tagger, spa_tagger, cat_tagger

    if lang == "English":
        if not eng_tagger:
            eng_tagger = load(
                'taggers/maxent_treebank_pos_tagger/english.pickle')
        return eng_tagger

    if lang == "Spanish":
        if not spa_tagger:
            spa_tagger = _train_backoff_tagger(cess_esp.tagged_sents())
        return spa_tagger

    # Anything that is neither English nor Spanish is handled as Catalan.
    if not cat_tagger:
        cat_tagger = _train_backoff_tagger(cess_cat.tagged_sents())
    return cat_tagger
Esempio n. 4
0
File: B.py Progetto: keyu-lai/NLP
    def Catalan_tagger():
        """Train and return a trigram POS tagger on the CESS-CAT corpus.

        The trigram tagger backs off to a bigram tagger trained on the same
        sentences, which in turn backs off to a default tagger that always
        answers "NOUN".
        """
        from nltk import BigramTagger, DefaultTagger, TrigramTagger
        from nltk.corpus import cess_cat

        sentences = cess_cat.tagged_sents()
        fallback = DefaultTagger("NOUN")
        bigram = BigramTagger(sentences, backoff=fallback)
        return TrigramTagger(sentences, backoff=bigram)
Esempio n. 5
0
    def __init__(self, train_percent_size=1):
        """
        Train the tagger stored in ``self._tagger`` on part of the Catalan
        tagged corpus.

        :param train_percent_size: 0-1; forwarded to ``subset_from_corpus``
            together with the corpus sentences
        :return:
        """
        self._tagger = trained_tagger_with_corpus(
            subset_from_corpus(cat_corpus.tagged_sents(), train_percent_size)
        )
Esempio n. 6
0
def set_tagger(language):
    """Build and return a part-of-speech tagger for ``language``.

    English loads the pickled maxent treebank tagger.  Catalan and Spanish
    train a trigram tagger on the matching CESS corpus, backing off to
    bigram, unigram and finally a default tagger that answers 'NN'.

    :param language: 'English', 'Catalan' or 'Spanish'.
    :return: the tagger for the language.
    :raises ValueError: if ``language`` is not one of the supported names.
    """
    if language == 'English':
        return load('taggers/maxent_treebank_pos_tagger/english.pickle')

    if language == 'Catalan':
        corpus = cess_cat
    elif language == 'Spanish':
        corpus = cess_esp
    else:
        # Fail with a message that names the offending value instead of
        # reaching a return statement with no tagger bound.
        raise ValueError(
            "Unsupported language: %r (expected 'English', 'Catalan' or "
            "'Spanish')" % (language,))

    training = corpus.tagged_sents()
    tagger = nltk.DefaultTagger('NN')
    # Each stage is trained on the same sentences and backs off to the
    # previously built one.
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                       nltk.TrigramTagger):
        tagger = tagger_cls(training, backoff=tagger)
    return tagger
Esempio n. 7
0
 def test_catalan(self):
     # The first 15 corpus tokens, written out as one space-separated string.
     expected_tokens = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial".split()
     self.assertEqual(cess_cat.words()[:15], expected_tokens)
     # Word form of the entry at index 34 in the first tagged sentence.
     first_sentence = cess_cat.tagged_sents()[0]
     self.assertEqual(first_sentence[34][0], "càrrecs")
Esempio n. 8
0
# Regex fragment: one or more characters that are neither '/' nor a space.
WORD_OR_TAG = '[^/ ]+'
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
            'Catalan: CESS-CAT Corpus (simplified)':
                lambda: cess_cat.tagged_sents(simplify_tags=True),
            'English: Brown Corpus':
                lambda: brown.tagged_sents(),
            'English: Brown Corpus (simplified)':
                lambda: brown.tagged_sents(simplify_tags=True),
            'English: Brown Corpus (Press, simplified)':
                lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True),
            'English: Brown Corpus (Religion, simplified)':
                lambda: brown.tagged_sents(categories='religion', simplify_tags=True),
            'English: Brown Corpus (Learned, simplified)':
                lambda: brown.tagged_sents(categories='learned', simplify_tags=True),
            'English: Brown Corpus (Science Fiction, simplified)':
                lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True),
            'English: Brown Corpus (Romance, simplified)':
                lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
            'English: Brown Corpus (Humor, simplified)':
 def test_catalan(self):
     # The first 15 corpus tokens, written out as one space-separated string.
     expected_tokens = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial".split()
     self.assertEqual(cess_cat.words()[:15], expected_tokens)
     # Word form of the entry at index 34 in the first tagged sentence.
     first_sentence = cess_cat.tagged_sents()[0]
     self.assertEqual(first_sentence[34][0], "càrrecs")
Esempio n. 10
0
import nltk
from nltk.corpus import cess_cat
from nltk.tokenize import word_tokenize
# Train a unigram tagger on the tagged sentences of the CESS-CAT corpus.
tagged_sents = cess_cat.tagged_sents()
unigram_tagger = nltk.UnigramTagger(tagged_sents)
# Sample sentence to tag, split into tokens with word_tokenize.
oracio = "avui fa sol però demà plourà"
tokens = word_tokenize(oracio)
# Tag the tokens and print the tagging result.
analisi = unigram_tagger.tag(tokens)
print(analisi)
Esempio n. 11
0
import A
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from nltk import word_tokenize
from nltk.corpus import cess_esp
from nltk.corpus import cess_cat
from nltk.data import load
from sklearn import svm
import nltk
from nltk import UnigramTagger as ut

# Unigram POS taggers (``ut`` is nltk.UnigramTagger) built from the CESS
# Catalan and CESS Spanish tagged sentences.  Both are built when the module
# is imported.
tagger_cat  = ut(cess_cat.tagged_sents())
tagger_esp = ut(cess_esp.tagged_sents())
# You might change the window size
# NOTE(review): presumably a number of context tokens; confirm at the use site.
window_size = 15

def b1_base(data):
    '''
    Documented contract for the B1 baseline features of one lexelt.

    NOTE(review): only the signature and this docstring are visible here;
    the description below restates the original docstring and should be
    checked against the implementation.

    :param data: list of instances for a given lexelt, each a tuple
        (instance_id, left_context, head, right_context, sense_id)
    :return: vectors: a dictionary mapping each instance_id to a list of
            word counts
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: a dictionary mapping each instance_id to its sense_id
            { instance_id : sense_id }

    NOTE(review): the original docstring also described a parameter ``s``
    (list of feature words for the lexelt: [w1, w2, w3, ...]) that is not
    part of this signature; confirm whether it was removed or is still
    needed.
    '''
Esempio n. 12
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus (simplified)':
    lambda: cess_cat.tagged_sents(tagset='universal'),
    'English: Brown Corpus':
    lambda: brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
    lambda: brown.tagged_sents(tagset='universal'),
    'English: Brown Corpus (Press, simplified)':
    lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                               tagset='universal'),
    'English: Brown Corpus (Religion, simplified)':
    lambda: brown.tagged_sents(categories='religion', tagset='universal'),
    'English: Brown Corpus (Learned, simplified)':
    lambda: brown.tagged_sents(categories='learned', tagset='universal'),
    'English: Brown Corpus (Science Fiction, simplified)':
    lambda: brown.tagged_sents(categories='science_fiction',
                               tagset='universal'),
    'English: Brown Corpus (Romance, simplified)':
Esempio n. 13
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r"\b"

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="universal"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
        categories="religion", tagset="universal"
    ),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
        categories="learned", tagset="universal"
    ),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
Esempio n. 14
0
# Regex fragment: one or more characters that are neither '/' nor a space.
WORD_OR_TAG = "[^/ ]+"
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r"\b"

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(tagset="simple"),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="simple"),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
Esempio n. 15
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
            'Catalan: CESS-CAT Corpus (simplified)':
                lambda: cess_cat.tagged_sents(tagset='simple'),
            'English: Brown Corpus':
                lambda: brown.tagged_sents(),
            'English: Brown Corpus (simplified)':
                lambda: brown.tagged_sents(tagset='simple'),
            'English: Brown Corpus (Press, simplified)':
                lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'),
            'English: Brown Corpus (Religion, simplified)':
                lambda: brown.tagged_sents(categories='religion', tagset='simple'),
            'English: Brown Corpus (Learned, simplified)':
                lambda: brown.tagged_sents(categories='learned', tagset='simple'),
            'English: Brown Corpus (Science Fiction, simplified)':
                lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
            'English: Brown Corpus (Romance, simplified)':
                lambda: brown.tagged_sents(categories='romance', tagset='simple'),
            'English: Brown Corpus (Humor, simplified)':
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
            'Catalan: CESS-CAT Corpus (simplified)':
                lambda: cess_cat.tagged_sents(tagset='universal'),
            'English: Brown Corpus':
                lambda: brown.tagged_sents(),
            'English: Brown Corpus (simplified)':
                lambda: brown.tagged_sents(tagset='universal'),
            'English: Brown Corpus (Press, simplified)':
                lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
            'English: Brown Corpus (Religion, simplified)':
                lambda: brown.tagged_sents(categories='religion', tagset='universal'),
            'English: Brown Corpus (Learned, simplified)':
                lambda: brown.tagged_sents(categories='learned', tagset='universal'),
            'English: Brown Corpus (Science Fiction, simplified)':
                lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
            'English: Brown Corpus (Romance, simplified)':
                lambda: brown.tagged_sents(categories='romance', tagset='universal'),
            'English: Brown Corpus (Humor, simplified)':
Esempio n. 17
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus (simplified)':
    lambda: cess_cat.tagged_sents(tagset='simple'),
    'English: Brown Corpus':
    lambda: brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
    lambda: brown.tagged_sents(tagset='simple'),
    'English: Brown Corpus (Press, simplified)':
    lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                               tagset='simple'),
    'English: Brown Corpus (Religion, simplified)':
    lambda: brown.tagged_sents(categories='religion', tagset='simple'),
    'English: Brown Corpus (Learned, simplified)':
    lambda: brown.tagged_sents(categories='learned', tagset='simple'),
    'English: Brown Corpus (Science Fiction, simplified)':
    lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
    'English: Brown Corpus (Romance, simplified)':
    lambda: brown.tagged_sents(categories='romance', tagset='simple'),
Esempio n. 18
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r"\b"

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)":
    lambda: cess_cat.tagged_sents(tagset="universal"),
    "English: Brown Corpus":
    lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)":
    lambda: brown.tagged_sents(tagset="universal"),
    "English: Brown Corpus (Press, simplified)":
    lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"],
                               tagset="universal"),
    "English: Brown Corpus (Religion, simplified)":
    lambda: brown.tagged_sents(categories="religion", tagset="universal"),
    "English: Brown Corpus (Learned, simplified)":
    lambda: brown.tagged_sents(categories="learned", tagset="universal"),
    "English: Brown Corpus (Science Fiction, simplified)":
    lambda: brown.tagged_sents(categories="science_fiction",
                               tagset="universal"),
    "English: Brown Corpus (Romance, simplified)":
Esempio n. 19
0
# Regex word-boundary assertion (raw string keeps the backslash literal).
BOUNDARY = r'\b'

# Event-name strings.
# NOTE(review): the '<<...>>' form looks like Tk virtual-event names; confirm
# against the code that binds or generates them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NOTE(review): the unit is not visible here (presumably milliseconds);
# confirm at the use site.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Label of one corpus entry.
# NOTE(review): presumably the _CORPORA key used when nothing else is
# selected; confirm at the use site.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus (simplified)':
    lambda: cess_cat.tagged_sents(simplify_tags=True),
    'English: Brown Corpus':
    lambda: brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
    lambda: brown.tagged_sents(simplify_tags=True),
    'English: Brown Corpus (Press, simplified)':
    lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                               simplify_tags=True),
    'English: Brown Corpus (Religion, simplified)':
    lambda: brown.tagged_sents(categories='religion', simplify_tags=True),
    'English: Brown Corpus (Learned, simplified)':
    lambda: brown.tagged_sents(categories='learned', simplify_tags=True),
    'English: Brown Corpus (Science Fiction, simplified)':
    lambda: brown.tagged_sents(categories='science_fiction',
                               simplify_tags=True),
    'English: Brown Corpus (Romance, simplified)':