Example #1
    def tag_sents(self, sents):
        '''
        Tag a list of sentences. NB: before using this function, the user should specify the model_file, either by
            - training a new model using the ``train`` function, or
            - using a pre-trained model set via the ``set_model_file`` function.
        :param sents: list of sentences to tag
        :type sents: list(list(str))
        :return: list of tagged sentences
        :rtype: list(list(tuple(str, str)))
        '''
        if self._model_file == '':
            raise Exception('No model file found! Please use the train or set_model_file function.')
        
        # We need a list of sentences rather than a generator so input and output can be matched up

################ added by Kathrin #########################################
        default = DefaultTagger('None')
        sents = default.tag_sents(sents)
###########################################################################
        result = []  
        for tokens in sents:
            features = [self._feature_func(tokens,i) for i in range(len(tokens))]
            labels = self._tagger.tag(features)
                
            if len(labels) != len(tokens):
                raise Exception('Predicted length does not match token length; expect errors!')
############### added by Kathrin ############################################
            tokens = [i[0] for i in tokens]
#############################################################################
            tagged_sent = list(zip(tokens,labels))
            result.append(tagged_sent)
            
        return result 
Example #2
class Chunker:

	_tagger = DefaultTagger
	
	def __init__(self, words, sents):
		self._tagger = DefaultTagger('NN')
		self.tag_words(words, sents)

	def tag_words(self, words, sents):
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		test_sents = tagger.tag(sents[0])
		# test_sents = treebank.tagged_sents()[3000:]
		# print(treebank.tagged_sents()[1:])
		# print("accuracy: " + str(self._tagger.evaluate(test_sents)))
		# print(self._tagger.tag(words))
		# print(test_sents)
		print(tagger.evaluate(test_sents))

	def get_accuracy(self, sentences=None):

		if not sentences:
			test_sents = treebank.tagged_sents()[6000:]
		else:
			test_sents = sentences
		print(self._tagger.evaluate(test_sents))
Example #3
def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence]
                         for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence]
                   for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be 'NN', so we could really skip this
    # step altogether
    # I went with NN as it was the default value shown in the nltk DefaultTagger documentation; completely arbitrary
    defaultTagger = DefaultTagger('NN')
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == 'NN':
                matches += 1

    accuracy = (matches / totalTags) * 100
    return accuracy
Example #5
 def test_default_tagger(self):
     test_list = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
     tagger = DefaultTagger('N')
     split = int(len(test_list) * .90)
     train_data = test_list[:split]
     test_data = test_list[split:]
     print(tagger.evaluate(train_data))
     print(tagger.evaluate(test_data))
Example #6
def find_accuracy(train_set, test_set):
    # should everything here be the test set?
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    train_set_most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(train_set_most_frequent_tag)
    accuracy_result = default_tagger.evaluate(test_set)
    return accuracy_result
Example #7
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.
    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
    :return:
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:  # Leave One Out cross-validation
        num_folds = len(tagged_sents)-1
    subset_size = int(len(tagged_sents) / num_folds)
    for i in range(num_folds):

        # training and testing data for this round
        X_test = tagged_sents[i * subset_size:][:subset_size]
        X_train = tagged_sents[:i * subset_size] + tagged_sents[(i + 1) * subset_size:]

        # compute score for taggers
        default_score = backoff.evaluate(X_train)
        trigram, tagger_scores = backoff_tagger(X_train, X_test,
                                                tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    for k, v in scores.items():  # average scores across folds
        if v:
            scores[k] = sum(v)/len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
Example #8
def find_combined_taggers_accuracy(train_set, test_set):
    # finding most used tag
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger_result)

    # bigram tagger with different backoffs
    bigram_tagger = BigramTagger(train_set)
    bigram_tagger_backoff_unigram = BigramTagger(train_set,
                                                 backoff=unigram_tagger)
    bigram_tagger_backoff_regex = BigramTagger(train_set, backoff=regex_tagger)

    bigram_tagger_result = bigram_tagger.evaluate(test_set)
    bigram_tagger_backoff_regex_result = bigram_tagger_backoff_regex.evaluate(
        test_set)
    bigram_tagger_backoff_unigram_result = bigram_tagger_backoff_unigram.evaluate(
        test_set)

    print("Bigram Tagger Accuracy: ", bigram_tagger_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_tagger_backoff_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_tagger_backoff_unigram_result)
Example #9
def train_tagger():
    """
	This function trains the tagger
	"""
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[3000:]  # NB: overlaps train_sents (size > 3000), which inflates the score below

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # 0.9613641269156055

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
Example #10
    def train(self, sentence_list):
        """Trains the tagger from the tagged sentences provided
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        trigram_fallback = TrigramTagger(sentence_list,
                                         backoff=bigram_fallback)
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                          (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                          (1, 1))
        ]

        trainer = brill.FastBrillTaggerTrainer(trigram_fallback, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
Example #11
def train(train_sentences):
    print "- Default Tagger"
    default_tagger = DefaultTagger('NC')

    print "- Unigram Tagger"
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print "- Templates"
    #These templates define the features to be used for the brill tagger
    # relatively to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print "- Brill Tagger"
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print "- Done."

    return tagger
Example #12
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent  # (the original called .encode('utf-8') here, a Python 2 idiom)

        text = self.regexTokenizer(kalimat.lower().strip())

        ## :> --___<<IMPORTANT>>___--
        ##      For some purposes the default tag must be left as 'XX',
        ##      so that entities can be identified later.
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: 77%
        #      tanpa regextagger : > 90%
        """
        if verbose:
           # The larger the document, the longer the accuracy computation takes;
           # recommended for testing only
           print ("Calculating Tagger Accuracy...")
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print ("Accuracy is: %s" % (self.tagAccuracy))
        
        return trigram_tagger.tag(text)
Example #13
def train():
    try:
        input = open('tagger.pkl', 'rb')
        print("Found tagger")

        tagger = load(input)
        input.close()
    except IOError:
        print('Training:')

        train_sents = brown.tagged_sents()[:50000]
        test_sents = brown.tagged_sents()[50000:]

        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]

        tagger = backoff_tagger(train_sents,
                                tagger_classes,
                                backoff=DefaultTagger('unseen'))
        print('Finished training, tagger accuracy:')
        print(tagger.evaluate(test_sents))

        output = open('tagger.pkl', 'wb')
        dump(tagger, output, -1)
        output.close()

    return tagger
Example #14
 def __init__(self):
     self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
     self.st = StanfordNERTagger(
         'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
         'stanfordNERJars/stanford-ner.jar',
         encoding='utf-8')
     if os.path.exists("out/"):
         shutil.rmtree('out/')
Example #15
def ngramtagging(train):  # POS tagging step
    train_data = []
    train_data.append(train)
    backoff_tagger = DefaultTagger('nn')
    unigram_tagger = UnigramTagger(train_data, backoff = backoff_tagger)
    bigram_tagger = BigramTagger(train_data, backoff = unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff = bigram_tagger)
    return trigram_tagger
Example #16
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)

    return tags
Example #18
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
Example #19
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
Example #20
def lexical(tokens):
    print "\n"
    print "Step 2: Lexical Analysis\n"
    print "Essentially refers to dictionary and obtains the properties of the word"
    print "Part-Of-Speech tagging"
    print "The tagset is:\n"

    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained POS tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
Example #21
    def wordTagger(self, wordlist, number):
        train_sents = treebank.tagged_sents()[:3000]
        if number == 1:
            taglist = nltk.pos_tag(wordlist)
        elif number == 2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number == 3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)

        elif number == 4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number == 5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        return taglist
Example #22
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
Example #23
def train_tagger():
    '''
    An example of training a POS tagger using a
    probability-based trigram model.

    A POS tagger identifies the word class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (Preposition Verb Article Noun)
    '''

    # Loading a Portuguese dataset that contains
    # manually tagged sentences
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default word class. N means noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
Example #24
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    # (the opening of this function and its first few templates were truncated
    # in the original snippet; the signature is reconstructed from the call below)
    templates = [
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])), #you can look at the combination of the previous two words to learn a transformation rule
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)

defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))
Example #25
def backoff_tagger(train_sents, tagger_classes):
    backoff = DefaultTagger('NN')
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
Example #26
def indivDefault(bambara):
    default = DefaultTagger('n')
    print(default.evaluate(bambara.test_sents))
    return default
Example #27
import nltk
from nltk.tag import DefaultTagger
tag = DefaultTagger('NN')
print(tag.tag(['Beautiful', 'morning']))

Example #28
from nltk.tag import tnt, DefaultTagger
import pickle

datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.split('\n\n')

train_sents = []

for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)  # Trained=True: the unk tagger needs no further training
tnt_tagger.train(train_sents)
tagger_file = open("indonesian_tnt_pos_tag.pickle", "wb")
pickle.dump(tnt_tagger, tagger_file)
tagger_file.close()
Example #29
patterns = [  # (hypothetical name; the opening of this list was truncated in the original snippet)
    ('BASIS', 'BASIS'),
    ('CORPORATE', 'CORPORATE'),
    ('OTHER|other', 'OTHER'),
    ('LAST|last', 'LAST'),
    (r'POS', 'POINT_OF_SALE'),
    (r'AFTERCOMPANY|CORPORATE|COMPANY', 'CORPORATE'),
    (r'DISCOUNT|COMMISSIO|COMMISSIONS|COMISSION|discount|discounts|Commissionable|commissionable',
     'DISCOUNT'),
    (r'ASIA|Asia|asia|AISA|亚洲', 'ASIA'),
    (r'NORTH|North|north', 'NORTH'),
    (r'SOUTH|South|south', 'SOUTH')

    #            ('TICKET','VALIDITY')
]  # add learning loop here for tags

def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)

backoff = RegexpTagger(
    [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'is|was|are|were', 'VBZ'),  # verb to be
        (r'"', 'QT'),  # quote
        (r'.*', 'NN')  # nouns (default)
    ])
Example #30
 def setUp(self):
     self.corpus = brown.tagged_sents()[:35]
     self.decoder = JSONTaggedDecoder()
     self.encoder = JSONTaggedEncoder()
     self.default_tagger = DefaultTagger("NN")
Example #31
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from nltk.corpus import wordnet as wn
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'

TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
Example #32
#word_tokenize('Hello World.')

# load pickled tokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
word_list = tokenizer.tokenize( sentence )

#--------------------------------------------------------------------------------
# Parts of Speech
#--------------------------------------------------------------------------------

# Default tagging
from nltk.tag import DefaultTagger

# if all else fails, make an unknown word a noun ( "NN" )
default_tagger = DefaultTagger( 'NN' )

# try it.
tagged_sentence = default_tagger.tag( word_list )

# Can also batch tag, but need a list of sentences, each already tokenized.
# NB: batch_tag() was renamed tag_sents() in NLTK 3.
#tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

#--------------------------------------------------------------------------------
# Training taggers
#--------------------------------------------------------------------------------

# so far so good.  Next have to train taggers.

# Unigram, training on Treebank corpus
from nltk.tag import UnigramTagger
Example #33
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# though it's too simple to be useful, we can try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# for sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# untagging
from nltk.tag import untag

print(untag([('Hello', 'NN'), ('World', 'NN')]))
Example #34
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
#print(train_data[0])

dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

nt = ClassifierBasedPOSTagger(train=train_data,
                              classifier_builder=NaiveBayesClassifier.train)
print(nt.evaluate(test_data))
Example #35
for page in list(root):
    l = []
    # (.decode('utf8') calls removed; they were a Python 2 idiom)
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1
shuffle(data)

# Divide data into train and test sets (a 90/10 split, despite the variable name)
eightyPercent = count*0.9
training_set = data[0:int(eightyPercent)]
test_set = data[int(eightyPercent):]

# Train
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff = tag1)
tag3 = BigramTagger(train_data, backoff = tag2)
tag4 = TrigramTagger(train_data, backoff = tag3)

# Accuracy
# print(tag4.tag('open a start up'.split()))
# print(tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.split()))
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
Example #36
def Tagger():
    #Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
Example #37
f.write("\n".join(caps))

f.close()

#adding the tagger

import nltk

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

tagger.evaluate(test_sents)

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks
Example #38
tagged_sent = tag(sentence)
print(tagged_sent)


# building your own tagger

# preparing the data
from nltk.corpus import treebank
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

print(dt.evaluate(test_data))

print(dt.tag(tokens))


# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
Example #39
######### DEFAULT TAGGER ###############

#Assigning the default Tag
from nltk.tag import DefaultTagger, untag
tagger = DefaultTagger('NN')
tokens = ['Hello', 'World']
sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag(tokens))

print(tagger.tag_sents(sents))

#Untagging
tagged = tagger.tag(tokens)
print(untag(tagged))

#Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
Example #40
from nltk.metrics import *
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
from nltk.corpus import webtext as web
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0