def load_data(self, percentage):
        """Load tagged and untagged treebank sentences for training and testing.

        Files whose id contains ``wsj_00`` form the training pool and files
        whose id contains ``wsj_01`` form the test set.  Only a leading slice
        of the training files is kept.

        :param percentage: multiplier applied to the number of training
            files; the product is truncated to an int and used as the slice
            bound (e.g. 0.5 keeps the first half of the training files).
        :returns: a 6-tuple of (tagged training sentences, tagged testing
            sentences, tagged training words, tagged testing words,
            untagged training sentences, untagged testing sentences).
        """
        print("Started Loading the Data")

        train_ids = []
        test_ids = []
        for file_id in treebank.fileids():
            file_name = str(file_id)
            # Two independent checks, so a file matching both lands in both.
            if "wsj_00" in file_name:
                train_ids.append(file_id)
            if "wsj_01" in file_name:
                test_ids.append(file_id)

        # Keep only the requested share of the training files.
        keep = int(percentage * len(train_ids))
        train_ids = train_ids[:keep]

        train_sents = treebank.tagged_sents(fileids=train_ids)
        test_sents = treebank.tagged_sents(fileids=test_ids)

        train_words = treebank.tagged_words(fileids=train_ids)
        test_words = treebank.tagged_words(fileids=test_ids)

        # Strip the tags for consumers that need plain token lists.
        plain_train = list(map(untag, train_sents))
        plain_test = list(map(untag, test_sents))

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(train_sents))
        print("Testing Data  Sentences: ", len(test_sents))

        return (train_sents, test_sents, train_words, test_words,
                plain_train, plain_test)
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a maximum-entropy tagger on them, tests the tagger's
    accuracy and tags an unseen sentence.

    The first 10% of the loaded sentences are held out for testing; the
    remainder is used for training. If C{corpus} names neither supported
    corpus, a hint is printed and the function returns without training.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}
    (compared case-insensitively).

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    corpus_name = corpus.lower()
    if corpus_name == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus_name == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        # Nothing was loaded; stop here instead of failing on an unbound
        # name further down.
        return

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    # Every print below takes exactly one argument so the output is the same
    # under Python 2 and Python 3.
    accuracy = maxent_tagger.evaluate(test_sents)
    # Report the real training size: the corpus may hold fewer than
    # num_sents sentences.
    print("tagger accuracy (test %i sentences, after training %i): %s"
          % (size, len(train_sents), accuracy))
    print("\n\n")
    tagged = maxent_tagger.tag(["This", "is", "so", "slow", "!"])
    print("classify unseen sentence:  %s" % (tagged,))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
Example #3
0
def demo3():
    """Run a 10-fold cross-validation of TnT on treebank and brown samples.

    The first 1000 tagged sentences of each corpus are split into ten folds.
    For every fold a fresh ``TnT(N=1000, C=False)`` tagger is trained on the
    other nine folds and evaluated on the held-out one.  The per-fold values
    are summed and printed multiplied by 10, i.e. as the mean over the ten
    folds expressed in percent.

    The ``known`` / ``unknown`` attributes of a tagger are read after each
    ``evaluate`` call and reset to 0 afterwards (presumably ``evaluate``
    counts known/unknown words into them -- confirm against ``TnT``).
    """
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # Running sums over the folds; "t" is the treebank tagger, "s" the
    # brown tagger.
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        d_lo, d_hi = i * d10, (i + 1) * d10
        e_lo, e_hi = i * e10, (i + 1) * e10

        # Train on everything except the current fold.
        t.train(d[:d_lo] + d[d_hi:])
        s.train(e[:e_lo] + e[e_hi:])

        tacc = t.evaluate(d[d_lo:d_hi])
        # float() keeps this a true division under Python 2 as well.
        tp_kn = float(t.known) / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(e[e_lo:e_hi])
        sp_kn = float(s.known) / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        # Each tagger is normalised by its own known-word share.
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc

    # The labels follow the corpus each tagger was actually trained on.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
	def get_accuracy(self, sentences=None):
		"""Print the accuracy of ``self._tagger`` on tagged sentences.

		:param sentences: gold-standard tagged sentences to evaluate on.
			When omitted (or an empty list), the treebank sentences from
			index 6000 onwards are used.
		:returns: None; the score returned by ``evaluate`` is printed.
		"""
		# None replaces the former mutable ``[]`` default; an explicit empty
		# list still selects the default test set, as before.
		if sentences is None or sentences == []:
			# NOTE(review): the NLTK treebank *sample* holds fewer than 6000
			# sentences, so this slice may be empty -- confirm which corpus
			# ``treebank`` refers to here.
			test_sents = treebank.tagged_sents()[6000:]
		else:
			test_sents = sentences
		# Single-argument print: same output under Python 2 and Python 3.
		print(self._tagger.evaluate(test_sents))
Example #5
0
def tag_matching(sequences):
    """Count how often each tag pair occurs in the treebank sentences.

    For every sequence, each token whose tag equals ``seq[0]`` is inspected:
    if the following token's tag equals ``seq[1]`` the match count is
    incremented; if the lookup raises ``IndexError`` (e.g. the token is the
    last one of its sentence) the error count is incremented instead.

    :param sequences: iterable of tag sequences; only ``seq[0]`` and
        ``seq[1]`` are used.
    :returns: list of ``(seq, count, errors)`` tuples, one per input
        sequence, in input order.
    """
    treebank_sentences = treebank.tagged_sents()

    resultset = []
    for seq in sequences:
        # Fresh counters for every sequence, so neither the matches nor the
        # errors of an earlier sequence leak into a later one.
        count = 0
        errors = 0
        for sent in treebank_sentences:
            for i, token in enumerate(sent):
                if token[1] != seq[0]:
                    continue
                try:
                    if sent[i + 1][1] == seq[1]:
                        count += 1
                except IndexError:
                    errors += 1
        resultset.append((seq, count, errors))
    return resultset
Example #6
0
def demo2():
    """Compare TnT with and without capitalization handling on treebank.

    Two taggers (``C=False`` and ``C=True``) are trained on the treebank
    sentences from index 1100 onwards.  Each of the first ten blocks of 100
    sentences is then evaluated with both taggers, printing accuracy, the
    share of known/unknown words and the accuracy over known words.  The
    taggers' ``known``/``unknown`` attributes are reset after every
    evaluation.
    """
    from nltk.corpus import treebank

    sents = list(treebank.tagged_sents())

    plain = TnT(N=1000, C=False)
    cased = TnT(N=1000, C=True)
    plain.train(sents[1100:])
    cased.train(sents[1100:])

    runs = (('Capitalization off:', plain), ('Capitalization on:', cased))

    for block in range(10):
        for heading, tagger in runs:
            held_out = sents[block * 100:(block + 1) * 100]
            accuracy = tagger.evaluate(held_out)

            total = float(tagger.known + tagger.unknown)
            unknown_share = float(tagger.unknown) / total
            known_share = float(tagger.known) / total

            # Clear the counters so the next evaluation starts from zero.
            tagger.unknown = 0
            tagger.known = 0

            print(heading)
            print('Accuracy:', accuracy)
            print('Percentage known:', known_share)
            print('Percentage unknown:', unknown_share)
            print('Accuracy over known words:', (accuracy / known_share))
def main():
    """Train a regexp/unigram/bigram/trigram backoff chain and pickle it.

    The taggers are trained on all treebank tagged sentences, each backing
    off to the previous one (trigram -> bigram -> unigram -> regexp).  Each
    n-gram tagger is written to its own file in the current directory:
    ``unigram_tagger.bin``, ``bigram_tagger.bin`` and ``trigram_tagger.bin``.
    """
    regexp_tagger = nltk.RegexpTagger(
           [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
           (r'(The|the|A|a|An|an)$', 'AT'),   # articles
           (r'.*able$', 'JJ'),                # adjectives
           (r'.*ness$', 'NN'),                # nouns formed from adjectives
           (r'.*ly$', 'RB'),                  # adverbs
           (r'.*s$', 'NNS'),                  # plural nouns
           (r'.*ing$', 'VBG'),                # gerunds
           (r'.*ed$', 'VBD'),                 # past tense verbs
           (r'.*', 'NN')                      # nouns (default)
    ])

    training_data = treebank.tagged_sents()

    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)

    outputs = (
        ("unigram_tagger.bin", unigram_tagger),
        ("bigram_tagger.bin", bigram_tagger),
        ("trigram_tagger.bin", trigram_tagger),
    )
    for filename, tagger in outputs:
        # Pickle data is bytes, so the file must be opened in binary mode;
        # the with-block guarantees it is flushed and closed.
        with open(filename, "wb") as handle:
            pickle.Pickler(handle).dump(tagger)
Example #8
0
 def traintest_bigram_trigram_tagger(self):
     """Train a bigram and a trigram tagger on treebank and print their scores.

     The first 3000 treebank sentences are used for training and the
     remaining ones for evaluation.  Neither tagger has a backoff tagger.
     """
     from nltk.tag import BigramTagger, TrigramTagger
     from nltk.corpus import treebank
     tagged = treebank.tagged_sents()
     test_sents = tagged[3000:]
     train_sents = tagged[:3000]

     # Single-argument print calls behave the same on Python 2 and 3.
     print('trainging bigramTagger')
     bitagger = BigramTagger(train_sents)
     print('evaluation bitagger')
     print(bitagger.evaluate(test_sents))

     print('trainging trigram Tagger')
     tritagger = TrigramTagger(train_sents)
     # This score belongs to the trigram tagger, so label it as such.
     print('evaluation tritagger')
     print(tritagger.evaluate(test_sents))
     print('tagging')
Example #9
0
File: bench.py Project: seomoz/mltk
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk

    Tokens tagged "-NONE-" are dropped, the remaining words are tagged
    sentence by sentence with the module-level ``tagger`` and the predicted
    tags are compared position by position with the gold tags.  Timing,
    throughput and accuracy are printed.
    '''
    from nltk.corpus import treebank

    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    for sentence in treebank.tagged_sents():
        # Filter once per sentence, then split into words and gold tags.
        kept = [ele for ele in sentence if ele[1] != '-NONE-']
        text.append([ele[0] for ele in kept])
        tags.extend(ele[1] for ele in kept)

    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()

    # The builtin zip works on both Python 2 and 3 (izip is Python 2 only).
    ncorrect = sum(bool(t == p[1])
        for t, p in zip(tags, chain.from_iterable(predicted)))

    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
def create_input_dataset():
	"""Build per-token feature dicts from ``wsj`` and dump them to data.json.

	For each token of the first ``no_of_sentences`` tagged sentences a dict
	is built with these keys:

	* ``wn``: five-word window ``[w-1, w-2, w, w+1, w+2]`` with ``'*'``
	  standing in for positions outside the sentence,
	* ``index``: position of the token inside its sentence,
	* ``i``: running token number across all sentences,
	* ``t_minus_one`` / ``t_minus_two``: tags of the two preceding tokens
	  (``'*'`` at the sentence start),
	* ``tag``: the token's own tag.

	Each dict is written to ``data.json`` as one JSON document per line.

	:returns: ``(input_data, tags)`` -- the list of feature dicts and the
		parallel list of gold tags.
	"""
	# Single-argument print: same output on Python 2 and Python 3.
	print('Loading input')
	input_data = []
	tags = []
	sents = wsj.sents()
	counter = 0
	# The with-block closes the file even if a sentence fails to process.
	with open('data.json', 'w') as json_file:
		for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
			# Fetch the plain-word sentence once instead of once per lookup.
			words = sents[i]
			len_sentence = len(sentence)
			prev = None
			prev_prev = None
			for j, word in enumerate(sentence):
				datapoint = {
					'wn': [
						words[j-1] if j > 0 else '*',
						words[j-2] if j > 1 else '*',
						words[j],
						words[j+1] if j < len_sentence-1 else '*',
						words[j+2] if j < len_sentence-2 else '*',
					],
					'index': j,
					'i': counter,
					't_minus_one': '*' if prev is None else prev[1],
					't_minus_two': '*' if prev_prev is None else prev_prev[1],
				}
				counter += 1

				prev_prev = prev
				prev = word

				datapoint['tag'] = word[1]
				json_file.write(json.dumps(datapoint))
				json_file.write('\n')
				input_data.append(datapoint)
				tags.append(word[1])
	print('Done')
	return input_data, tags
Example #11
0
def get_pos_tagger():
    """Return a trigram POS tagger trained on all treebank tagged sentences.

    The tagger backs off to a bigram tagger, then a unigram tagger, and
    finally to a default tagger that labels every token ``NN``.
    """
    train_sents = treebank.tagged_sents()
    # Build the chain from the last fallback up to the most specific model.
    fallback = nltk.DefaultTagger("NN")
    unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
    bigram = nltk.BigramTagger(train_sents, backoff=unigram)
    return nltk.TrigramTagger(train_sents, backoff=bigram)
Example #12
0
def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    """Split tagged sentences into training, baseline, gold and test data.

    :param tagged_data: tagged sentences, or None to load the treebank
        tagged sentences.
    :param train: proportion of the used sentences that goes into training;
        the rest is reserved for testing.
    :param num_sents: number of sentences to use; None (or a value not
        smaller than the data size) means all of them.
    :param randomize: if true, shuffle the sentences first.  The random
        generator is seeded with the number of sentences, so the order is
        reproducible.
    :param separate_baseline_data: if true, the first third of the training
        data is split off as baseline data; otherwise the baseline data is
        the training data itself.
    :returns: ``(training_data, baseline_data, gold_data, testing_data)``
        where ``testing_data`` is ``gold_data`` with the tags removed.
    """
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        # Shuffle a private list copy: this leaves the caller's sequence
        # untouched and also works when the data is a read-only corpus view.
        tagged_data = list(tagged_data)
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    # Keep only the words (first element of each token) for the test input.
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
Example #13
0
def make_sentences():
    """Convert the treebank tagged sentences into (word id, tag id) pairs.

    Word ids come from ``./embeddings/words.lst`` and tag ids from
    ``data/tags.lst``; both are 1-indexed by line number.  Each token is
    normalised before lookup: ``-NONE-`` tokens are dropped, Penn bracket
    placeholders are replaced by the bracket character, the word is
    lower-cased, a word starting with a digit becomes ``0`` and a word
    missing from the word list becomes ``UNKNOWN``.

    Sentences with four or fewer remaining tokens are discarded.  The mean
    length of the kept sentences is printed.

    :returns: list of sentences, each a list of ``(word_id, tag_id)``.
    :raises KeyError: if a tag (or the ``0`` / ``UNKNOWN`` placeholder) is
        missing from the corresponding lookup file.
    """
    with open("./embeddings/words.lst") as word_file:
        dictionary = [line.strip() for line in word_file]
    ind_lookup = {word: (ind + 1) for ind, word in enumerate(dictionary)}

    with open("data/tags.lst") as tag_file:
        taglst = [line.strip() for line in tag_file]
    tag_lookup = {word: (ind + 1) for ind, word in enumerate(taglst)}

    bracket_rep = {"-LRB-": "(",
                   "-RRB-": ")",
                   "-LSB-": "[",
                   "-RSB-": "]",
                   "-LCB-": "{",
                   "-RCB-": "}"}

    def normalize(item):
        # Map the bracket placeholder first: the keys are upper-case, so the
        # lookup has to happen before the word is lower-cased.
        item = bracket_rep.get(item, item).lower()
        if item[0].isdigit():
            item = u'0'
        if item not in ind_lookup:
            item = u"UNKNOWN"
        return item

    sentences = []
    for sent in treebank.tagged_sents():
        # 1 indexed!!!
        encoded = [(ind_lookup[normalize(item)], tag_lookup[tag])
                   for (item, tag) in sent if tag != '-NONE-']
        if len(encoded) > 4:
            sentences.append(encoded)

    print(sum(map(len, sentences)) / float(len(sentences)))

    return sentences
Example #14
0
 def split_sents(self, train=0.95, total=3500,
                 document_class=TaggedSentence):
     """Split the tagged corpus into training and testing documents.

     :param train: fraction of the used sentences that goes into the
         training split.
     :param total: maximum number of sentences to use; None uses all.
     :param document_class: callable applied to every sentence of both
         splits.
     :returns: ``(training, testing)`` -- two ``map`` results over the
         leading and the following slice of the sentences.
     """
     sents = tagged_corpus.tagged_sents()[:total]
     # Size the split from what was actually loaded.  If the corpus holds
     # fewer than ``total`` sentences, using the requested total would push
     # the split point past the data and leave the test split short or empty.
     total = len(sents)
     i = int(round(train * total))
     j = i + int(round(total - train * total))
     return (map(document_class, sents[0:i]),
             map(document_class, sents[i:j]))
def demo(corpus, num_sents):
    """
    Loads a few sentences from a tagged corpus, trains a maximum-entropy
    tagger on them, evaluates it and pickles the trained tagger.

    The first 10% of the loaded sentences are held out for evaluation; the
    remainder is used for training. The trained tagger is written to
    C{test.pkl} in the current directory. If C{corpus} names no supported
    corpus, a hint is printed and the function returns without training.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load (compared case-insensitively):
    C{brown}, C{treebank}, C{floresta} or C{cintil}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    corpus_name = corpus.lower()

    if corpus_name == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    elif corpus_name == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus_name == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]

    elif corpus_name == "cintil":
        # Single-argument print: same output on Python 2 and Python 3.
        print("Loading CINTIL")
        column_types = ['words','pos','ignore']
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]

    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        # Nothing was loaded; stop here instead of failing on an unbound
        # name further down.
        return

    size = int(len(tagged_sents) * 0.1)

    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    # The evaluation result is not used here.
    maxent_tagger.evaluate(test_sents)

    # The with-block closes the file even if pickling fails part-way.
    with open('test.pkl', "wb") as fModel:
        pickle.dump(maxent_tagger, fModel, 1)
def get_tagger():
    """Return the POS tagger, loading it from ``tagger_fn`` when possible.

    If the pickled tagger cannot be loaded, a new
    ``ClassifierBasedPOSTagger`` is trained on the treebank tagged sentences
    and pickled to ``tagger_fn`` for the next call.

    :returns: the loaded or freshly trained tagger.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; tagger_fn must point
        # to a trusted file.
        # Pickle data is bytes, so the file is opened in binary mode.
        with open(tagger_fn, "rb") as tagger_file:
            tagger = pickle.load(tagger_file)
    except Exception:
        # Best-effort cache: any failure to read it (missing file, stale or
        # corrupt pickle) falls back to retraining.  Unlike a bare except,
        # this lets KeyboardInterrupt and SystemExit propagate.
        tagger = ClassifierBasedPOSTagger(train=treebank.tagged_sents())
        with open(tagger_fn, "wb") as tagger_file:
            pickle.dump(tagger, tagger_file)
    return tagger
Example #17
0
    def run(self):
        """Train a TnT POS tagger and attach it to the running app's root.

        The tagger is created with a default ``NN`` tagger for unknown
        words, stored as ``app.root.tnt_tagger`` and then trained on the
        first 2000 treebank tagged sentences.
        """
        app = App.get_running_app()

        # Single-argument print: same output on Python 2 and Python 3.
        print('start training TnT pos tagger')
        train_sents = treebank.tagged_sents()[:2000]
        unk = DefaultTagger('NN')
        # The tagger is published on the app root before training starts,
        # in the same order as before.
        app.root.tnt_tagger = tnt.TnT(unk=unk, Trained=True)
        app.root.tnt_tagger.train(train_sents)
        print('end training TnT pos tagger')
	def tag_words(self, words, sents):
		"""Tag the first sentence with a treebank unigram tagger and print a score.

		A ``UnigramTagger`` is trained on all treebank tagged sentences on
		every call.

		:param words: not used by this method.
		:param sents: sequence of tokenised sentences; only ``sents[0]`` is
			tagged.
		:returns: None; the value returned by ``evaluate`` is printed.
		"""
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		test_sents = tagger.tag(sents[0])
		# NOTE(review): evaluate() receives a single sentence that this same
		# tagger just tagged, not a list of gold-standard sentences --
		# confirm this is the intended evaluation input.
		# Single-argument print: same output on Python 2 and Python 3.
		print(tagger.evaluate(test_sents))
Example #19
0
def create_dataset():
	"""Build per-token feature dicts and gold tags from the ``wsj`` corpus.

	Only the first ``no_of_sentences`` tagged sentences are used.  Each
	token yields a dict with these keys:

	* ``wn``: five-word window ``[w-1, w-2, w, w+1, w+2]`` where ``'*'``
	  marks positions outside the sentence,
	* ``index``: position of the token inside its sentence,
	* ``t_minus_one`` / ``t_minus_two``: tags of the two preceding tokens
	  (``'*'`` at the sentence start).

	:returns: ``(dataset, tags)`` -- the feature dicts and the parallel list
		of gold tags.
	"""
	dataset = []
	tags = []
	sents = wsj.sents()

	for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
		last = len(sentence) - 1
		for j, tagged_word in enumerate(sentence):
			words = sents[i]
			window = [
				words[j-1] if j > 0 else '*',
				words[j-2] if j > 1 else '*',
				words[j],
				words[j+1] if j < last else '*',
				words[j+2] if j < last - 1 else '*',
			]
			dataset.append({
				'wn': window,
				'index': j,
				# Tags of the previous two tokens, read straight from the
				# tagged sentence.
				't_minus_one': sentence[j-1][1] if j > 0 else '*',
				't_minus_two': sentence[j-2][1] if j > 1 else '*',
			})
			tags.append(tagged_word[1])
	return dataset, tags
Example #20
0
File: brill.py Project: 4li/nlp
    def train_parser(self):
        """Train a Brill tagger on treebank and pickle it to ``self.pickle_path``.

        An initial unigram/bigram/trigram backoff tagger (ending in a default
        ``NN`` tagger) is built from the first 3000 treebank tagged sentences
        and then handed to ``self.train_brill_tagger`` together with the
        same sentences.

        :returns: the trained Brill tagger.
        """
        default_tagger = DefaultTagger("NN")
        train_sents = treebank.tagged_sents()[:3000]
        initial_tagger = self.backoff_tagger(
            train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger
        )
        # NOTE(review): the score is computed on the training data and then
        # discarded -- confirm whether this call is still needed.
        initial_tagger.evaluate(train_sents)
        brill_tagger = self.train_brill_tagger(initial_tagger, train_sents)

        # The with-block flushes and closes the file deterministically.
        with open(self.pickle_path, "wb") as pickle_file:
            pickle.dump(brill_tagger, pickle_file)
        return brill_tagger
Example #21
0
def make_backoff_tagger():
	"""Build a backoff POS tagger trained on the treebank tagged sentences.

	The chain is assembled by ``backoff_tagger`` from a UnigramTagger, a
	BigramTagger and a TrigramTagger, with a default tagger that returns
	``NN`` as the final fallback.

	:returns: A backoff POS tagger.

	"""
	training_sents = treebank.tagged_sents()
	ngram_classes = [UnigramTagger, BigramTagger, TrigramTagger]
	last_resort = DefaultTagger('NN')
	return backoff_tagger(training_sents, ngram_classes, backoff=last_resort)
Example #22
0
def train_pos_tagger():
  """
  Trains a trigram POS tagger on Penn Treebank sentences loaded with
  simplified tags and returns it.

  The tagger backs off to a bigram tagger, then a unigram tagger and
  finally to a default tagger that labels every token "NN".

  NOTE(review): the ``simplify_tags`` keyword appears to exist only in
  older NLTK releases -- confirm the NLTK version this targets.
  """
  train_sents = treebank.tagged_sents(simplify_tags=True)
  # Assemble the chain from the last fallback up to the trigram model.
  fallback = nltk.DefaultTagger("NN")
  unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
  bigram = nltk.BigramTagger(train_sents, backoff=unigram)
  return nltk.TrigramTagger(train_sents, backoff=bigram)
Example #23
0
def create_dataset():
    """Build per-token feature dicts and gold tags from the ``wsj`` corpus.

    Only the first ``no_of_sentences`` tagged sentences are used.  Each
    token yields a dict with these keys:

    * ``wn``: five-word window ``[w-1, w-2, w, w+1, w+2]`` where ``"*"``
      marks positions outside the sentence,
    * ``index``: position of the token inside its sentence,
    * ``t_minus_one`` / ``t_minus_two``: tags of the two preceding tokens
      (``"*"`` at the sentence start).

    :returns: ``(dataset, tags)`` -- the feature dicts and the parallel list
        of gold tags.
    """
    # Single-argument print: same output on Python 2 and Python 3.
    print("Loading dataset")
    dataset = []
    tags = []
    sents = wsj.sents()

    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        # Fetch the plain-word sentence once instead of once per lookup.
        words = sents[i]
        len_sentence = len(sentence)
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {
                "wn": [
                    words[j - 1] if j > 0 else "*",
                    words[j - 2] if j > 1 else "*",
                    words[j],
                    words[j + 1] if j < len_sentence - 1 else "*",
                    words[j + 2] if j < len_sentence - 2 else "*",
                ],
                "index": j,
                "t_minus_one": "*" if prev is None else prev[1],
                "t_minus_two": "*" if prev_prev is None else prev_prev[1],
            }

            prev_prev, prev = prev, word
            dataset.append(datapoint)
            tags.append(word[1])
    print("Done")
    return dataset, tags
Example #24
0
 def traintest_uni_bi_tri_tagger(self):
     """Train a unigram/bigram/trigram backoff tagger and print its results.

     The chain is built by ``self.backoff_tagger`` from the treebank
     sentences at index 3000 onwards, with a default ``NN`` tagger as the
     final fallback.  It is evaluated on the conll2000 sentences at index
     8000 onwards, and finally used to tag a fixed sample text.

     NOTE(review): training uses the tail slice ``[3000:]`` of treebank --
     confirm that ``[:3000]`` was not intended.
     """
     from nltk.tag import DefaultTagger,UnigramTagger, BigramTagger, TrigramTagger
     from nltk.corpus import conll2000, treebank
     test_sents  = conll2000.tagged_sents()[8000:]
     train_sents = treebank.tagged_sents()[3000:]
     # Single-argument print calls behave the same on Python 2 and 3.
     print('trainging trigramter with backoff')
     backoff = DefaultTagger('NN')
     tagger = self.backoff_tagger(train_sents, [UnigramTagger, BigramTagger,TrigramTagger], backoff=backoff)
     print('evaluation trigram with backoff')
     print(tagger.evaluate(test_sents))
     print('tagging')
     print(tagger.tag(word_tokenize("This is a test. This should be faster than nothing. How can I rent a car in the next twelve hours? ")))
def process(data):
    """Clean every tweet in ``data`` and store the non-empty results back.

    An affix -> unigram -> bigram backoff tagger is trained on the treebank
    tagged sentences and passed to ``pos_tag_filter``.  Each tweet runs
    through the cleaning helpers in a fixed order; tweets that end up empty
    are dropped.  A running tweet number is printed as progress output.

    :param data: object providing ``get_tweets()`` and ``set_tweets()``.
    """
    # Load the training sentences once and share them between the taggers.
    train_sents = treebank.tagged_sents()
    t0 = AffixTagger(train=train_sents)
    t1 = UnigramTagger(train=train_sents, backoff=t0)
    t2 = BigramTagger(train=train_sents, backoff=t1)

    processed_tweets = []
    for count, tweet in enumerate(data.get_tweets(), 1):
        # Single-argument print: same output on Python 2 and Python 3.
        print(count)
        tweet = remove_hashtags(tweet)
        tweet = remove_user_tags(tweet)
        tweet = remove_html_entities(tweet)
        tweet = remove_punctuation_deep(tweet)
        tweet = tokenize_and_remove_stopwords(tweet)
        tweet = remove_apostrophes(tweet)
        tweet = remove_multiple_spaces(tweet)
        tweet = translate_slang(tweet)
        tweet = pos_tag_filter(tweet, data, t2)
        if not is_empty(tweet):
            processed_tweets.append(tweet)
    data.set_tweets(processed_tweets)
Example #26
0
   def evaluate(self):
      '''Score ``self.classifier`` on treebank, conll2000 and brown data.

      The first 100 tagged sentences of treebank and of conll2000 are used,
      and the last 20% of the brown tagged sentences.  Every score is the
      classifier's ``evaluate`` result multiplied by 100.

      Returns the tuple (treebank_result, conll2000_result, brown_result).
      '''
      def percent(gold_sents):
         return (100*self.classifier.evaluate(gold_sents))

      treebank_result = percent(treebank.tagged_sents()[:100])
      conll2000_result = percent(conll2000.tagged_sents()[:100])

      brown_sents = brown.tagged_sents()
      tail_start = int(len(brown_sents)*0.8)
      brown_result = percent(brown_sents[tail_start:])

      return (treebank_result, conll2000_result, brown_result)
Example #27
0
 def from_treebank(klass):
     """Build a ``klass`` instance populated from tagged corpora and a lexicon.

     Every (word, tag) pair of the treebank and ``treebank_brown`` tagged
     sentences is counted with the word lower-cased.  The entries returned
     by ``get_lexicon()`` are then added with ``closed_class=False``, and
     finally ('can', 'VB') is counted ten extra times.

     NOTE(review): ``brown`` is imported here but never used, while
     ``treebank_brown`` is not defined in this function -- presumably a
     module-level corpus; confirm.

     :returns: the populated ``klass`` instance.
     """
     from nltk.corpus import brown, treebank
     dist = klass()

     def absorb(tagged_sentences):
         for sentence in tagged_sentences:
             for token, pos in sentence:
                 dist.inc(token.lower(), pos)

     absorb(treebank.tagged_sents())
     absorb(treebank_brown.tagged_sents())

     for entry, pos in get_lexicon():
         dist.inc(entry, pos, closed_class=False)

     for _ in range(10):
         dist.inc('can', 'VB')
     return dist
 def LemmatizeSents(self,sents):
     """Lemmatize every word of every sentence and return the joined results.

     Each sentence is tokenised with ``word_tokenize`` and tagged with a
     unigram/bigram/trigram tagger (no further backoff) that is trained on
     the treebank tagged sentences on every call.  Each tag is converted
     with ``self.tagMap`` and passed to the WordNet lemmatizer together
     with the word.

     :param sents: iterable of sentence strings.
     :returns: list of strings, one per input sentence, holding the
         lemmatized words joined by single spaces.
     """
     tagger=tagging(treebank.tagged_sents(),[UnigramTagger,BigramTagger,TrigramTagger],backoff=None)
     # One lemmatizer serves all words; it used to be rebuilt per word.
     lemmatizer=WordNetLemmatizer()
     newSents=[]
     for sent in sents:
         taggedSent=tagger.tag(word_tokenize(sent))
         # Build the word list in one pass instead of repeated list copies.
         words=[lemmatizer.lemmatize(wd,self.tagMap(tg)) for (wd,tg) in taggedSent]
         newSents.append(' '.join(words))
     return newSents
Example #29
0
    def __init__(self, train_set='treebank'):
        '''
        Load a pickled Brill tagger, or train and pickle a new one.

        If both pickle files (``_pickle_file`` and
        ``_test_sents_pickle_file``) exist, the tagger and its held-out test
        sentences are loaded from them.  Otherwise the chosen corpus is
        shuffled, split 80/20 into training and test sentences, a Brill
        tagger is trained on top of a unigram/bigram/trigram backoff tagger,
        and both the tagger and the test sentences are pickled.

        train_set -- 'treebank' selects the treebank corpus; any other value
        selects the brown corpus.
        '''

        # Before building a new tagger check if one has already been pickled.
        # The check uses the same paths that are opened below and requires
        # both files, so a missing test-sentence pickle triggers retraining
        # instead of a failure.
        # NOTE: unpickling can execute arbitrary code; both pickle files must
        # come from a trusted source.
        if os.path.exists(_pickle_file) and os.path.exists(_test_sents_pickle_file):
            with open(_pickle_file, 'rb') as tagger_file:
                self._tagger = load(tagger_file)
            with open(_test_sents_pickle_file, 'rb') as sents_file:
                self._test_sents = load(sents_file)

        # Primitives necessary for training the Brill tagger.
        # Taken from cookbook
        else:
            if train_set == 'treebank':
                tagged_sents = list(treebank.tagged_sents())
            else:
                tagged_sents = list(brown.tagged_sents())
            random.shuffle(tagged_sents)
            split_index = int(round(0.8 * len(tagged_sents)))
            train_sents = tagged_sents[:split_index]
            self._test_sents = tagged_sents[split_index:]
            default_tagger = DefaultTagger('NN')
            tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
            initial_tagger = backoff_tagger(train_sents, tagger_classes, backoff=default_tagger)
            sym_bounds = [(1,1), (2,2), (1,2), (1,3)]
            asym_bounds = [(-1, -1), (1,1)]
            templates = [
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, *sym_bounds),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, *sym_bounds),
                brill.ProximateTokensTemplate(brill.ProximateTagsRule, *asym_bounds),
                brill.ProximateTokensTemplate(brill.ProximateWordsRule, *asym_bounds)]

            # Train the tagger
            trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates, deterministic=True)
            self._tagger = trainer.train(train_sents)

            # Pickle the trained tagger
            if not os.path.exists(os.getcwd() + '/pickles/'):
                os.mkdir(os.getcwd() + '/pickles/')
            # The with-blocks close the files even if pickling fails.
            with open(_pickle_file, 'wb') as tagger_file:
                dump(self._tagger, tagger_file, -1)
            with open(_test_sents_pickle_file, 'wb') as sents_file:
                dump(self._test_sents, sents_file, -1)
def tag_penn(words):
    """
    Tags words with a unigram tagger trained on the Penn Treebank tagged
    sentences. The tagger is trained anew on every call.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    The result of the tagger's ``tag`` call on ``words``.
    """
    unigram = UnigramTagger(treebank.tagged_sents())
    return unigram.tag(words)