Example #1
	def __init__(self, mwe=True):
		self.mwe = mwe
		# Train the tagger if it's used for the first time.
		try:
			loadtagger('cess_unigram.tagger').tag(['estoy'])
			loadtagger('cess_bigram.tagger').tag(['estoy'])
		except IOError:
			print("*** First-time use of cess tagger ***")
			print("Training tagger ...")
			from nltk.corpus import cess_esp as cess

			cess_sents = cess.tagged_sents()
			traintag('cess', cess_sents)
			# Train the tagger without MWEs (multi-word expressions).
			cess_nomwe = unchunk(cess.tagged_sents())
			tagged_cess_nomwe = batch_pos_tag(cess_nomwe)
			traintag('cess_nomwe', tagged_cess_nomwe)
			print()
		# Load the tagger.
		if self.mwe:
			self.uni = loadtagger('cess_unigram.tagger')
			self.bi = loadtagger('cess_bigram.tagger')
		else:
			self.uni = loadtagger('cess_nomwe_unigram.tagger')
			self.bi = loadtagger('cess_nomwe_bigram.tagger')
Example #2
 def __init__(self, tsents=cess_esp.tagged_sents()):
     """
     :param tsents: list of annotated sentences
     """
     self.__corpus = tsents
     self.__is_trained = False
     self.__tagger = None
Example #3
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
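Note that get_tagger relies on the three cache globals already existing at module level; a one-line setup sketch (names taken from the function, initialization assumed):

# Module-level cache slots consulted by get_tagger(); start out empty.
eng_tagger = spa_tagger = cat_tagger = None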
Example #4
def annotate_pos(sentence, annotations):
    """Produce part-of-speech annotations for a source-language
    sentence."""

    global TAGGER
    if TAGGER is None:
        logging.debug('Training part-of-speech tagger..')

        if os.path.isfile(TAGGER_FILENAME):
            with open(TAGGER_FILENAME, 'rb') as f:
                TAGGER = pickle.load(f)
        else:
            training_data = cess_esp.tagged_sents()

            unigram_tagger = UnigramTagger(training_data)
            bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
            TAGGER = BetterTagger(bigram_tagger)

            with open(TAGGER_FILENAME, 'wb') as f:
                pickle.dump(TAGGER, f)

    parts_of_speech = TAGGER.tag([t.lower() for t in sentence])

    annotations['pos'] = parts_of_speech
    return sentence, annotations
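BetterTagger is not defined in this excerpt. A minimal sketch of what such a wrapper could look like, assuming it merely substitutes a default tag whenever the bigram/unigram chain returns None (the class body is guessed for illustration, not the project's actual code):

class BetterTagger(object):
    # Hypothetical wrapper: fill in None tags from the wrapped tagger
    # with a CESS-style default noun tag.
    def __init__(self, tagger, default_tag='ncms000'):
        self._tagger = tagger
        self._default_tag = default_tag

    def tag(self, tokens):
        return [(tok, tag if tag is not None else self._default_tag)
                for tok, tag in self._tagger.tag(tokens)]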
Example #5
def train_and_save_spanish_tagger():
    cess_tagged_sents = cess_esp.tagged_sents()
    tagger = nltk.UnigramTagger(cess_tagged_sents)
    fname = 'UnigramTagger_cess_esp.pkl'
    output = open(fname, 'wb')
    dump(tagger, output, -1)
    output.close()
Example #6
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,
                                                backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,
                                                backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
Example #7
def ner(sent):
    cess_sents = cess.tagged_sents()

    # Train the unigram tagger
    # uni_tag = ut(cess_sents)

    # Tagger reads a list of tokens.
    # uni_tag.tag(sent.split(" "))

    # Split corpus into training and testing set.
    # train = int(len(cess_sents)*90/100) # 90%

    # Train a bigram tagger with only training data.
    # bi_tag = bt(cess_sents[:train])

    # Evaluates on testing data remaining 10%
    # bi_tag.evaluate(cess_sents[train+1:])

    # Using the tagger.
    # bi_tag.tag(sent.split(" "))

    res = []
    with open('spanish_words.pkl', 'rb') as f:
        common_word = pickle.load(f)
    for word in sent.split():
        if word in common_word:
            res.append([word, common_word[word]])
    return str(res)
Example #8
 def __init__(self, tsents=cess_esp.tagged_sents()):
     """
     :param tsents: list of annotated sentences
     """
     self.__corpus = tsents
     self.__is_trained = False
     self.__tagger = None
Example #9
def run(train, test, language, answer):
    results = {}
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())

    for lexelt in train:

        train_features, y_train = extract_features(train[lexelt], language, tagger)
        test_features, _ = extract_features(test[lexelt], language, tagger)

        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)
    """
    B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])

        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c
    """
    A.print_results(results, answer)
Example #10
 def __init__(self):
     print("Training POS tagger...")
     # Materialize the lazy corpus view into a plain list.
     training = list(cess_esp.tagged_sents())
     self.tagger = nltk.tag.hmm.HiddenMarkovModelTagger.train(training)
     print("--Training complete--")
Example #11
 def __get_train_corpus(self):
     # TODO: find a more complete data corpus
     sentences = cess.tagged_sents()
     print("Sentence count: ", len(sentences)) if self.v >= 2 else None
     print("Sentences: ", sentences) if self.v >= 3 else None
     tam = round(len(sentences) * .8)
     self.train_data = sentences[:tam]
     self.test_data = sentences[tam:]
Example #12
File: B.py Project: keyu-lai/NLP
    def Spanish_tagger():
        import nltk
        from nltk.corpus import cess_esp

        training = cess_esp.tagged_sents()
        default_tagger = nltk.DefaultTagger("NOUN")
        bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
        trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
        return trigram_tagger
Example #13
    def __init__(self, train_percent_size=1):
        """

        :param train_percent_size: 0-1
        :return:
        """
        spanish_sents = spa_corpus.tagged_sents()
        subset = subset_from_corpus(spanish_sents, train_percent_size)
        self._tagger = trained_tagger_with_corpus(subset)
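subset_from_corpus and trained_tagger_with_corpus are not shown in this excerpt; a plausible sketch under the assumption that they slice off a leading fraction of the corpus and train a unigram/bigram backoff chain (names kept, bodies guessed):

import nltk

def subset_from_corpus(tagged_sents, percent):
    # Take the leading `percent` (0-1) fraction of the tagged corpus.
    cutoff = int(len(tagged_sents) * percent)
    return tagged_sents[:cutoff]

def trained_tagger_with_corpus(tagged_sents):
    # Bigram tagger that backs off to a unigram tagger.
    unigram = nltk.UnigramTagger(tagged_sents)
    return nltk.BigramTagger(tagged_sents, backoff=unigram)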
Example #14
def train_and_save_unigram_tagger(fname):
    #train nltk.UnigramTagger using
    #tagged sentences from cess_esp
    cess_tagged_sents = cess_esp.tagged_sents()
    tagger = nltk.UnigramTagger(cess_tagged_sents)

    #save the trained tagger in a file
    output = open(fname, 'wb')
    dump(tagger, output, -1)
    output.close()
Example #15
def tonkenier(text):
    cess_sents = cess.tagged_sents()
    uni_tag = ut(cess_sents)
    words = text.replace(",",
                         "").replace(".",
                                     "").replace("\n",
                                                 "").replace("\t",
                                                             "").split(" ")
    annotated_text = uni_tag.tag(words)
    return annotated_text
Example #16
 def __init__(self, use_mwe=False):
     self.use_mwe = use_mwe
     # Train tagger if it's used for the first time.
     try:
         load_tagger('cess_unigram.tagger').tag(['estoy'])
         load_tagger('cess_bigram.tagger').tag(['estoy'])
     except IOError:
         print("*** First-time use of cess tagger ***", file=sys.stderr)
         print("Training tagger ...", file=sys.stderr)
         # Load CESS corpus.
         cess_sents = cess.tagged_sents()
         train_tagger('cess', cess_sents)
         # Trains the tagger with no MWE.
         cess_nomwe = unchunk(cess.tagged_sents())
         tagged_cess_nomwe = pos_tag_sents(cess_nomwe, False)
         train_tagger('cess_nomwe', tagged_cess_nomwe)
     # Load tagger.
     _mwe_option_name = "_nomwe_" if self.use_mwe else "_"
     self.uni = load_tagger('cess{}unigram.tagger'.format(_mwe_option_name))
     self.bi = load_tagger('cess{}bigram.tagger'.format(_mwe_option_name))
Example #17
    def handling_negation(self, row):
        #Tokenize the row
        words = word_tokenize(row)

        # Read the corpus into a list,
        # each entry in the list is one sentence.
        tagged_cess_sents = cess_esp.tagged_sents()
        # Train the unigram tagger
        uni_tag = ut(tagged_cess_sents)
        # Tagger reads a list of tokens.
        #uni_tag.tag(words)
        print("WORDS: ", words)
        print("TAGGGGS: ", uni_tag.tag(words))
        '''
        # Split corpus into training and testing set.
        train = int(len(tagged_cess_sents)*90/100)
        # Train a bigram tagger with only training data
        bi_tag = bt(tagged_cess_sents[:train], backoff=uni_tag)
        # Evaluates on testing data remaining 10%
        bi_tag.evaluate(tagged_cess_sents[train+1:])
        # Using the tagger.
        #bi_tag.tag(row)
        print("TAGGGGS: ",bi_tag.tag(row))
        '''
        speech_tags = [
            'JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP'
        ]
        # Obtain each word's part of speech with the pos_tag function
        tags = nltk.pos_tag(words)
        print("WORDS: ", words)
        print("TAGS: ", tags)
        # Now check whether there is a negation among the words
        tags_2 = []
        words_2 = []

        if "no" in words:
            tags_2 = tags[words.index("no"):]
            words_2 = words[words.index("no"):]
            words = words[:words.index("no")]

        print("tags_2 ", tags_2)
        print("words_2 ", words_2)
        print("words ", words)

        for index, word_tag in enumerate(tags_2):
            print("index ", index)
            if word_tag[1] in speech_tags:
                print("REPLACE", word_tag[0])
                words = words + [replace_antonyms(word_tag[0])
                                 ] + words_2[index + 2:]
                #break
        #print("WORDS: ",words)
        print("FINAL TAGS2: ", tags_2)
        print("FINAL WORDS: ", ' '.join(words))
        return ' '.join(words)
Example #18
def get_tagger():
    global TAGGER
    if TAGGER is None:
        # TODO: Load tagger from pickled form
        training_data = cess_esp.tagged_sents()
        unigram_tagger = UnigramTagger(training_data)
        bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)

        TAGGER = bigram_tagger

    return TAGGER
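The TODO above is straightforward to fill in; a sketch that caches the trained tagger on disk with pickle (the spanish_tagger.pickle path is invented for illustration):

import os
import pickle

from nltk import UnigramTagger, BigramTagger
from nltk.corpus import cess_esp

TAGGER = None
TAGGER_FILENAME = 'spanish_tagger.pickle'  # hypothetical cache path

def get_tagger():
    global TAGGER
    if TAGGER is None:
        if os.path.isfile(TAGGER_FILENAME):
            # Load the tagger from its pickled form.
            with open(TAGGER_FILENAME, 'rb') as f:
                TAGGER = pickle.load(f)
        else:
            training_data = cess_esp.tagged_sents()
            unigram_tagger = UnigramTagger(training_data)
            TAGGER = BigramTagger(training_data, backoff=unigram_tagger)
            with open(TAGGER_FILENAME, 'wb') as f:
                pickle.dump(TAGGER, f)
    return TAGGER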
Example #19
def generateTagger():
    default_tagger = nltk.DefaultTagger('V')
    patterns = [(r'.*o$', 'NMS'),   # noun masculine singular
                (r'.*os$', 'NMP'),  # noun masculine plural
                (r'.*a$', 'NFS'),   # noun feminine singular
                (r'.*as$', 'NFP')   # noun feminine plural
                ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    return combined_tagger
Example #20
def main():
    
    # Read the corpus into a list, 
    # each entry in the list is one sentence.
    cess_sents = cess_esp.tagged_sents()

    # Train the unigram tagger
    uni_tag = ut(cess_sents)
    
    output = open('uni_tag.pkl', 'wb')
    dump(uni_tag, output, -1)
    output.close()
Example #21
	def __init__(self):
		if os.path.exists('tagger_spanish.pickle'):
			with open('tagger_spanish.pickle', 'rb') as file_obj:
				self.tagger = pickle.load(file_obj)
		else:
			print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
			from nltk import UnigramTagger, BigramTagger
			from nltk.corpus import cess_esp
			sents = cess_esp.tagged_sents()
			unigram_tagger = UnigramTagger(sents)
			# The bigram tagger backs off to the unigram tagger when it can't tag a word.
			self.tagger = BigramTagger(sents, backoff=unigram_tagger)
			with open('tagger_spanish.pickle', 'wb') as file_obj:
				pickle.dump(self.tagger, file_obj)  # Dump the trained tagger
Example #22
	def __init__(self):
		if os.path.exists('tagger_spanish.pickle'):
			with open('tagger_spanish.pickle', 'rb') as file_obj:
				self.tagger = pickle.load(file_obj)
		else:
			print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
			from nltk import UnigramTagger, BigramTagger
			from nltk.corpus import cess_esp
			sents = cess_esp.tagged_sents()
			unigram_tagger = UnigramTagger(sents)
			# The bigram tagger backs off to the unigram tagger when it can't tag a word.
			self.tagger = BigramTagger(sents, backoff=unigram_tagger)
			with open('tagger_spanish.pickle', 'wb') as file_obj:
				pickle.dump(self.tagger, file_obj)  # Dump the trained tagger
Example #23
def make_and_save_combined_tagger(fname):
    default_tagger = nltk.DefaultTagger('v')
    patterns = [(r'.*o$', 'n'),   # noun masculine singular
                (r'.*os$', 'n'),  # noun masculine plural
                (r'.*a$', 'n'),   # noun feminine singular
                (r'.*as$', 'n')   # noun feminine plural
               ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    output = open(fname, 'wb')
    dump(combined_tagger, output, -1)
    output.close()
Example #24
 def __init__(self, mwe=True):
     self.mwe = mwe
     # Train the tagger if it's used for the first time.
     try:
         loadtagger('cess_unigram.tagger').tag(['estoy'])
         loadtagger('cess_bigram.tagger').tag(['estoy'])
     except IOError:
         print("*** First-time use of cess tagger ***")
         print("Training tagger ...")
         from nltk.corpus import cess_esp as cess
         cess_sents = cess.tagged_sents()
         traintag('cess', cess_sents)
         # Train the tagger without MWEs (multi-word expressions).
         cess_nomwe = unchunk(cess.tagged_sents())
         tagged_cess_nomwe = batch_pos_tag(cess_nomwe)
         traintag('cess_nomwe', tagged_cess_nomwe)
         print()
     # Load the tagger.
     if self.mwe:
         self.uni = loadtagger('cess_unigram.tagger')
         self.bi = loadtagger('cess_bigram.tagger')
     else:
         self.uni = loadtagger('cess_nomwe_unigram.tagger')
         self.bi = loadtagger('cess_nomwe_bigram.tagger')
Example #25
def generate_tagger(route):
    import nltk
    from nltk.corpus import cess_esp
    from pickle import dump
    patterns = [
        (r".*o$", "NMS"),
        (r".*os$", "NMP"),
        (r".*a$", "NFS"),
        (r".*as$", "NFP"),
    ]
    cesp_tsents = cess_esp.tagged_sents()
    td = nltk.DefaultTagger("s")
    tr = nltk.RegexpTagger(patterns, backoff=td)
    tu = nltk.UnigramTagger(cesp_tsents, backoff=tr)
    output = open(route + 'tagger.pkl', 'wb')
    dump(tu, output, -1)
    output.close()
Example #26
def make_and_save_combined_tagger(fname):
    default_tagger = nltk.DefaultTagger('V')
    patterns = [(r'.*o$', 'NMS'),   # noun masculine singular
                (r'.*os$', 'NMP'),  # noun masculine plural
                (r'.*a$', 'NFS'),   # noun feminine singular
                (r'.*as$', 'NFP')   # noun feminine plural
               ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    # train nltk.UnigramTagger using tagged sentences from cess_esp
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    # save the trained tagger to a file
    output = open(fname, 'wb')
    dump(combined_tagger, output, -1)
    output.close()
Example #27
 def printSpanishTags(self):
     sents = cess_esp.tagged_sents()
     tagger = HiddenMarkovModelTagger.train(sents)

     fullCorpus = self.fullCorpus()
     tagsDictionary = dict()
     for line in fullCorpus:
         spanishSentence = line[0]
         spanishTokens = re.compile(r'\W+', re.UNICODE).split(spanishSentence)
         tags = tagger.tag(spanishTokens)
         for idx, token in enumerate(spanishTokens):
             if len(token) > 0:
                 tag = tags[idx][1]
                 sys.stdout.write(token)
                 sys.stdout.write(":")
                 sys.stdout.write(tag)
                 sys.stdout.write("\n")
Example #28
def set_tagger(language):
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Catalan':
        training = cess_cat.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    elif language == 'Spanish':
        training = cess_esp.tagged_sents()
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    return tagger
Example #29
def generateTagger(clean_vocabulary):      
    fname = 'combined_taggerP.pkl'
    default_tagger = nltk.DefaultTagger('V')
    patterns=[  (r'.*o$', 'NMS'), # noun masculine singular
                (r'.*os$', 'NMP'), # noun masculine plural
                (r'.*a$', 'NFS'),  # noun feminine singular
                (r'.*as$', 'NFP')  # noun feminine plural
            ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)
    
    s_tagged = combined_tagger.tag(clean_vocabulary)
    output = open(fname, 'wb')
    dump(s_tagged, output, -1)
    output.close()

    return s_tagged
Example #30
def init_tagger_chunkers():
    # Read corpus
    print("Reading cess_esp ...")
    sents = cess_esp.tagged_sents()
    corpus = utils.read_corpus(CORPUS_PATH)
    train, test = utils.split_to_train_test_set(corpus)

    # Train tagger
    print("Training tagger ...")
    tagger = Tagger.Tagger(TAGGER, sents)

    # Define the quantity and food grammar
    print("Initializing regex chunker ...")
    grammar = {}
    grammar[INGREDIENTE] = INGREDIENTE_GRAMMAR
    grammar[COMIDA] = COMIDA_GRAMMAR
    grammar[CANTIDAD] = CANTIDAD_GRAMMAR
    grammar[PEDIDO] = PEDIDO_GRAMMAR
    regex_chunker = RegexChunker.RegexChunker(grammar)

    # Save IOB tags
    utils.write_iob_tags(IOB_INPUT_FILE, corpus, tagger, regex_chunker)

    # Read IOB tags
    train_data, test_data = utils.read_iob_tag(IOB_INPUT_FILE)

    print("Initializing unigram chunker ...")
    unigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST)
    print("Initializing bigram chunker ...")
    bigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST, 2)
    print("Initializing trigram chunker ...")
    trigram_chunker = NGramChunker.Chunker(train_data, TO_DETECT_LIST, 3)
    print("Initializing bayes chunker ...")
    bayes_chunker = ConsecutiveNPChunker.ConsecutiveNPChunker(
        train_data, TO_DETECT_LIST)

    chunkers = {
        REGEX: regex_chunker,
        UNIGRAM: unigram_chunker,
        BIGRAM: bigram_chunker,
        TRIGRAM: trigram_chunker,
        BAYES: bayes_chunker
    }
    return tagger, chunkers
Example #31
 def __init__(self):
     """
     Initializes the tagger object
     """
     # nltk.download("cess_esp")
     # Read the corpus into a list, each entry in the list is one sentence.
     self.cess_sents = cess.tagged_sents()
     self.tagger_type = NLTK_TAGGER_NAME
     # This part is done only once, and we don't have to run it again.
     #   We keep the code here to understand how we created the model.
     # Train the unigram tagger
     # tagger = ut(self.cess_sents)
     # Store the trained tagger to a file
     # f = open('nltk_spanish_tagger.pickle', 'wb')
     # pickle.dump(tagger, f)
     # f.close()
     # Open the already trained model
     with open('nltk_spanish_tagger.pickle', 'rb') as f:
         # Load the tagger
         self.tagger = pickle.load(f)
Example #32
	def __init__(self):
		cess_sents = cess.tagged_sents()
		self.uni_tag = ut(cess_sents)

		self.model = NgramModel(3, brown.words())

		self.translation = []
		self.dictionary = collections.defaultdict(lambda: 0)
		dictionaryFile = open("../corpus/Dictionary.txt", 'r', encoding='utf-8')
		for translation in dictionaryFile:
			spanish, english = translation.split(" - ")
			self.dictionary[spanish] = collections.defaultdict(lambda: [])
			english = english.rstrip(';\n').split('; ')
			for pos in english:
				pos = pos.split(': ')
				self.dictionary[spanish][pos[0]] = pos[1].split(', ')

		self.sentences = []
		sentencesFile = open("../corpus/TestSet.txt", 'r')
		for sentence in sentencesFile:
			self.sentences.append(sentence.rstrip('\n'))
Example #33
    def __init__(self):
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)

        self.model = NgramModel(3, brown.words())

        self.translation = []
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r', encoding='utf-8')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')

        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))
Example #34
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(simplify_tags=True),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(simplify_tags=True),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(simplify_tags=True),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(simplify_tags=True),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'


    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT=0.30
Example #35
# back to our original example, one phrase is more dense than the other.

# <codecell>

assert density("en", nltk.pos_tag(tokenize("If I were you I wouldn’t do that with these."))) \
       < density("en", nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog.")))

# <markdowncell>

# Tag Spanish Text
# ================

# <codecell>

from nltk.corpus import cess_esp
sents = cess_esp.tagged_sents()

# <markdowncell>

# Split into training and test set

# <codecell>

training_dx = int(len(sents)*90/100)
training = sents[:training_dx]
test = sents[training_dx+1:]

# <markdowncell>

# train tagger and check accuracy (this takes 40 seconds or so) ...
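The training cell itself is missing from this excerpt; a minimal sketch of what it presumably contained, training a unigram/bigram chain on `training` and scoring it on `test` (the specific taggers used are an assumption):

# <codecell>

from nltk import UnigramTagger, BigramTagger

uni_tagger = UnigramTagger(training)                    # unigram baseline
bi_tagger = BigramTagger(training, backoff=uni_tagger)  # bigram with unigram backoff
print(bi_tagger.evaluate(test))                         # accuracy on the held-out 10%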
Example #36
import nltk
from nltk.corpus import cess_esp
from pickle import dump

patterns = [ (r".*o$","NMS"),
               (r".*os$","NMP"),
               (r".*a$","NFS"),
               (r".*as$","NFP"),
             ]
cesp_tsents = cess_esp.tagged_sents()
td = nltk.DefaultTagger("s")
tr = nltk.RegexpTagger(patterns, backoff = td )
tu = nltk.UnigramTagger(cesp_tsents, backoff = tr )
output = open("tagger.pkl","wb")
dump(tu,output,-1)
output.close()
Example #37
from nltk.tokenize import word_tokenize

# Spanish adaptation
from nltk.corpus import cess_esp
nltk.tag.mapping._load_universal_map("es-cast3lb")
mapdict = nltk.tag.mapping._MAPPINGS["es-cast3lb"]["universal"]
alltags = set(t for w, t in cess_esp.tagged_words())
for tag in alltags:
    if len(tag) <= 2:  # These are complete
        continue
    mapdict[tag] = mapdict[tag[:2]]

cess_esp._tagset = "es-cast3lb"
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
cess_sents = cess_esp.tagged_sents(tagset='universal')
uni_tag = ut(cess_sents, backoff=nltk.DefaultTagger('X'))


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
Example #38
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(simplify_tags=True),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30
Example #39
import A
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from nltk import word_tokenize
from nltk.corpus import cess_esp
from nltk.corpus import cess_cat
from nltk.data import load
import nltk
from nltk import UnigramTagger as ut

tagger_cat = ut(cess_cat.tagged_sents())
tagger_esp = ut(cess_esp.tagged_sents())
# You might change the window size
window_size = 15

def b1_base(data):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        {
                        [(instance_id, left_context, head, right_context, sense_id), ...]
        }
    :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...]
    :return: vectors: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: A dictionary with the following structure
            { instance_id : sense_id }

    '''
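The body of b1_base is cut off in this excerpt (its docstring also references a word list `s` that the truncated signature no longer shows, so the sketch takes it explicitly). A rough bag-of-words sketch consistent with the docstring, counting occurrences of each feature word within window_size tokens of the head; the exact feature scheme is a guess:

def b1_base(data, s):
    vectors, labels = {}, {}
    for instance_id, left_context, head, right_context, sense_id in data:
        # Keep window_size tokens on each side of the head word.
        left = word_tokenize(left_context)[-window_size:]
        right = word_tokenize(right_context)[:window_size]
        context = left + right
        vectors[instance_id] = [context.count(w) for w in s]
        labels[instance_id] = sense_id
    return vectors, labels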
Example #40
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='simple'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='simple'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='simple'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='simple'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30
Example #41
 def get_tagged_sentences(self):
     return cess_esp.tagged_sents()
Example #42
from nltk.corpus import cess_esp as cess
from nltk import RegexpTokenizer
import nltk
import pickle

# My sentences
sentence = "hola, hola, soy Pedro ¿como te llamas?."
tokenizer = RegexpTokenizer(r'\w+')
tokenized_words = tokenizer.tokenize(sentence)

# Declare train/test split
train = None
test = None
cess_sents = cess.tagged_sents()
try:
    with open('test_pickles/test_data.pickle', 'rb') as fa:
        div = pickle.load(fa)
        train = cess_sents[:div]
        test = cess_sents[div+1:]
except FileNotFoundError:
    # training data
    print("dumping train/test")
    div = len(cess_sents)*90//100
    train = cess_sents[:div]
    test = cess_sents[div+1:]

    with open('test_pickles/test_data.pickle', 'wb') as fb:
        pickle.dump(div, fb)

#####
#
Example #43
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='simple'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='simple'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='simple'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='simple'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'


    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT=0.30
Example #44
def init():
    cess_sents = cess.tagged_sents()
    unitag = ut(cess_sents)
Example #45
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"

    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30
Example #46
#! /usr/bin/python

# -*- coding: utf-8 -*-

from nltk.corpus import cess_esp, brown
from nltk import UnigramTagger, BigramTagger
from pickle import dump

# read corpus
corpusEs = cess_esp.tagged_sents()
corpusEn = brown.tagged_sents()

# Train the unigram taggers
uniTagEs = UnigramTagger(corpusEs)
uniTagEn = UnigramTagger(corpusEn)

# write out files
outputEs = open('uniTag.es.pkl', 'wb')
outputEn = open('uniTag.en.pkl', 'wb')
dump(uniTagEs, outputEs, -1)
dump(uniTagEn, outputEn, -1)
outputEs.close()
outputEn.close()
Example #47
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='universal'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30
Example #48
# POS tagging (in English; it does not work for Spanish)
# tagged = nltk.pos_tag(tokens)
# print("Tagged:")
# for t in tagged:
#     print("\t", "\t".join(t))

# Simple POS tagging (in Spanish)
# To tag Spanish we first need to train
# the tagger on a corpus of already-tagged sentences.
from nltk.corpus import cess_esp as cess  # the corpus
from nltk import UnigramTagger as ut  # the (one-word) unigram tagger
from nltk import BigramTagger as bt  # the (two-word) bigram tagger

# Read the corpus into a list;
# each entry in the list is one sentence
cess_sents = cess.tagged_sents()

# Split the corpus into two parts: training and testing
train = int(len(cess_sents) * 90 / 100)  # 90% for training

import pickle
crear_taggers = True
if crear_taggers:
    # Train the unigram tagger (there is no point testing with unigrams)
    uni_tag = ut(cess_sents)
    # Train the bigram tagger with only the training data
    bi_tag = bt(cess_sents[:train])

    # Save the taggers to files to save time next time
    with open('test/cess_unigram.tagger.pkl', 'wb') as output:
        pickle.dump(uni_tag, output, pickle.HIGHEST_PROTOCOL)
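The excerpt is cut off here; presumably the bigram tagger was dumped the same way (the filename below is guessed to mirror the unigram one):

    with open('test/cess_bigram.tagger.pkl', 'wb') as output:
        pickle.dump(bi_tag, output, pickle.HIGHEST_PROTOCOL)  # hypothetical completion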
Example #49
from nltk.tag.stanford import POSTagger		# For French and German

import json									# To be able to read json files
import urllib2								# To be able to read url (speech) containing json

import collections							# The collection module has useful functions to count automatically frequencies
from collections import Counter             # Module useful for frequencies in a list

import MySQLdb 								# Module for Mysql database
import datetime 							# Module to use date and time 
from config import DB_USER,DB_PWD,DB_NAME   # Database Settings  

languages = ["ar","de","en","es","fr","it","ko","no","pt","sv","zh"] 				   	# languages supported by this parser
punctuation = [",",";",".",":","!","?","(",")","-","%","\"","[","]"]                    # all marks of punctuation
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')    					# Loading English file to detect sentences
cess_sents = cess.tagged_sents()									 					# Spanish sentences
uni_tag = ut(cess_sents)											 					# Tagging spanish sentences
st_tag_english = NERTagger('english.all.3class.distsim.crf.ser.gz','stanford-ner.jar') 	# Entities for English
st_pos_french = POSTagger('french.tagger', 'stanford-postagger.jar') 				   	# French Grammar
st_pos_german = POSTagger('german-fast.tagger', 'stanford-postagger.jar') 			   	# German Grammar
st_tag_german = NERTagger('hgc_175m_600.crf.ser.gz','stanford-ner.jar')                	# Entities for German
chunker = nltk.data.load('chunkers/maxent_ne_chunker/english_ace_multiclass.pickle') 	# Loads the Chunk Parser
maxEnt = chunker._tagger.classifier()                                                	# The tag classifier for entities


# Please note that at the moment English, French, German and Spanish are the only languages supported by this program
# It is slower in German and better in English: in English it is possible to define a probability for entities (with the Chunk Parser)
#--------------------------------------------------------------------------------------------------------------------------------------


# Global variables
Example #50
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='universal'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='universal'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='universal'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='universal'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'


    #Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT=0.30
Example #51
def tonkenier(text):
    cess_sents = cess.tagged_sents()
    uni_tag = ut(cess_sents)
    words = text.replace(",", "").replace(".", "").replace("\n", "").replace("\t", "").split(" ")
    annotated_text = uni_tag.tag(words)
    return annotated_text
Example #52
        elif tag2.startswith('a'):
            number2, gender2 = self.number_from_adjective(
                tag2), self.gender_from_adjective(tag2)
        else:
            return True

        return number1 == number2 and (gender1 == "i" or gender2 == "i"
                                       or gender1 == gender2)

    def number_from_verb(self, tag):
        return tag[5]

    def gender_from_verb(self, tag):
        return tag[2]

    def number_from_pronom(self, tag):
        return tag[4]

    def gender_from_pronom(self, tag):
        return tag[3]

    def number_from_adjective(self, tag):
        return tag[4]

    def gender_from_adjective(self, tag):
        return tag[3]


if __name__ == "__main__":
    tagger = cess_esp.tagged_sents()
Example #53
from clean_tokens import *
from mutual_information import *
import nltk
from write import writeList
from nltk.corpus import cess_esp
from mutual_information import getSentences


def tag_spanish_sentence(sentence, tagger):
    tokens = nltk.word_tokenize(sentence)
    s_tagged = tagger.tag(tokens)
    return s_tagged


if __name__ == '__main__':
    """obteniendo el texto para tokenizar por oraciones"""
    """
    fname='C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string=get_text_string(fname)
    sentences = getSentences(text_string)
    sent = sentences[12]
    #print(type(sent))
    text = nltk.word_tokenize(sent)
    cad = nltk.Text(text)
    print(nltk.pos_tag(cad))
    """
    tagged_sents = cess_esp.tagged_sents()
    #print(tagged_sents)
    tagger = nltk.UnigramTagger(tagged_sents)
    #tagger.load(input)
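To round the __main__ block off, a short usage sketch feeding the trained tagger to the tag_spanish_sentence helper defined above (the sample sentence is arbitrary):

    print(tag_spanish_sentence("El gato duerme en la silla", tagger))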
Example #54
    "Dutch: Alpino Corpus (simplified)":
    lambda: alpino.tagged_sents(tagset="universal"),
    "Hindi: Indian Languages Corpus":
    lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)":
    lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
    "Portuguese: Floresta Corpus (Portugal)":
    lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)":
    lambda: floresta.tagged_sents(tagset="universal"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)":
    lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
    lambda: mac_morpho.tagged_sents(tagset="universal"),
    "Spanish: CESS-ESP Corpus (simplified)":
    lambda: cess_esp.tagged_sents(tagset="universal"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"

    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30
Example #55
 def __init__(self):
     cess_sents = cess.tagged_sents()
     self.uni_tag = ut(cess_sents)
     train = int(len(cess_sents)*90/100)  # 90%
     self.bi_tag = bt(cess_sents[:train])
     self.bi_tag.evaluate(cess_sents[train+1:])