Example #1
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
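
Example #1 assumes the train_%d.pos / test_%d.pos files already exist in TaggedCorpusReader's default format: one sentence per line, each token written as word/tag. A minimal, self-contained sketch of that format (the file name and tags below are placeholders, not taken from the code above):

import os
from nltk.corpus.reader import TaggedCorpusReader

demo_dir = os.path.expanduser('~/cltk_data/user_data')
os.makedirs(demo_dir, exist_ok=True)
with open(os.path.join(demo_dir, 'demo_train.pos'), 'w') as f:
    # default TaggedCorpusReader settings: sep='/', one sentence per line
    f.write('The/DT cat/NN sat/VBD ./.\n')

reader = TaggedCorpusReader(demo_dir, 'demo_train.pos')
print(reader.tagged_sents()[0])  # [('The', 'DT'), ('cat', 'NN'), ...]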
Example #2
def main():
    """main function
    """
    n = 2  # Bigram HMM
    args = parse_arguments()
    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0],
        os.path.split(args.train_f)[1])
    # note: this appears to assume one "word tag" pair per line in the training
    # file, so that each reader "sentence" is a [word, tag] pair
    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags

    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probability
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words,
                               tags,
                               state_space,
                               treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Test your HMM-trained model
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0],
        os.path.split(args.eval_f)[1])
    viterbi_tags = []

    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p,
                                        trans_p, emission_p)
        # Computes Viterbi's most likely tags

        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()

    print("Runtime (viterbi): %.3f s" % (end - start))
    output_path = "./" + "de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
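
The helpers used above (parse_arguments, comp_initial, comp_transition, comp_emission, pre_process, viterbi, viterbi_log, post_processing) are defined elsewhere and not shown. For reference, here is a minimal log-space Viterbi sketch under the assumption that O is the observation space, S the state space, Y the index-encoded observed sequence, pi the initial probabilities, A the transition matrix and B the emission matrix (the standard convention; the project's real viterbi_log may differ):

import math

def viterbi_log_sketch(O, S, Y, pi, A, B):
    """Most likely state sequence for Y; O is unused once Y is index-encoded."""
    neg_inf = float('-inf')

    def safe_log(x):
        return math.log(x) if x > 0 else neg_inf

    V = [{s: safe_log(pi[s]) + safe_log(B[s][Y[0]]) for s in S}]  # scores at t=0
    back = [{}]                                                   # backpointers
    for t in range(1, len(Y)):
        V.append({})
        back.append({})
        for s in S:
            best_prev = max(S, key=lambda prev: V[t - 1][prev] + safe_log(A[prev][s]))
            V[t][s] = V[t - 1][best_prev] + safe_log(A[best_prev][s]) + safe_log(B[s][Y[t]])
            back[t][s] = best_prev
    state = max(V[-1], key=V[-1].get)
    path = [state]
    for t in range(len(Y) - 1, 0, -1):
        state = back[t][state]
        path.append(state)
    return list(reversed(path))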
Example #3
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        raise ValueError('Invalid model_type: %s' % model_type)

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type,
        time.time() - now, path))
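
A hypothetical usage sketch for the models written above: load one of the pickles and tag a tokenized sentence. The path and tokens are placeholders; tokens the unigram model has not seen come back tagged None.

import os
import pickle

path = os.path.expanduser('~/greek_models_cltk/taggers/pos/unigram.pickle')
with open(path, 'rb') as f:
    tagger = pickle.load(f)
print(tagger.tag('κατέβην χθὲς εἰς Πειραιᾶ'.split()))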
Example #4
 def get_brill_tagger(self):
     train_data = TaggedCorpusReader('.',
                                     'tagged_input_sentences.txt',
                                     sep="/")
     traindata = list(train_data.tagged_sents())
     postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
     templates = [
         brill.Template(brill.Pos([-1])),
         brill.Template(brill.Pos([1])),
         brill.Template(brill.Pos([-2])),
         brill.Template(brill.Pos([2])),
         brill.Template(brill.Pos([-2, -1])),
         brill.Template(brill.Pos([1, 2])),
         brill.Template(brill.Pos([-3, -2, -1])),
         brill.Template(brill.Pos([1, 2, 3])),
         brill.Template(brill.Pos([-1]), brill.Pos([1])),
         brill.Template(brill.Word([-1])),
         brill.Template(brill.Word([1])),
         brill.Template(brill.Word([-2])),
         brill.Template(brill.Word([2])),
         brill.Template(brill.Word([-2, -1])),
         brill.Template(brill.Word([1, 2])),
         brill.Template(brill.Word([-3, -2, -1])),
         brill.Template(brill.Word([1, 2, 3])),
         brill.Template(brill.Word([-1]), brill.Word([1]))
     ]
     trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
     brill_tagger = trainer.train(traindata, max_rules=10)
     return brill_tagger
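
For a version that runs without the project-specific tagged_input_sentences.txt, here is a hedged, self-contained variant of the same Brill setup, trained on NLTK's bundled treebank sample and using the stock fntbl37 template set instead of the hand-written list above:

from nltk.corpus import treebank
from nltk.tag import UnigramTagger, brill, brill_trainer

train_sents = treebank.tagged_sents()[:1000]
baseline = UnigramTagger(train_sents)  # initial tagger to be corrected by rules
trainer = brill_trainer.BrillTaggerTrainer(baseline, brill.fntbl37(), trace=1)
brill_tagger = trainer.train(train_sents, max_rules=10)
print(brill_tagger.tag(['The', 'dog', 'barked', '.']))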
Example #5
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
Example #6
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of fileids to read; '.*' (the default) reads all files.
        :param encoding: File encoding.
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)
Example #7
    def __init__(self, corpusroot, corpusname):
        # use a custom wordlist corpus via WordListCorpusReader
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        # or via PlaintextCorpusReader
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        reader = TaggedCorpusReader(corpusroot, corpusname)
   
        self.reader_train = reader.tagged_sents()
        self.test_sent = reader.tagged_sents()[1000:] 
Example #8
 def read(self, file_path):
     logger.info('Reading instances from file %s', file_path)
     reader = TaggedCorpusReader(*os.path.split(file_path),
                                 sep='\t',
                                 word_tokenizer=RegexpTokenizer(r'\n',
                                                                gaps=True),
                                 sent_tokenizer=BlanklineTokenizer(),
                                 para_block_reader=lambda s: [s.read()])
     return Dataset([
         self.text_to_instance(*tuple(zip(*tagged_sent)))
         for tagged_sent in reader.tagged_sents()
     ])
Example #9
 def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
     corpus = \
         TaggedCorpusReader(ngram_directory,
                            ngram_file,
                            sent_tokenizer=LineTokenizer(blanklines='discard'),
                            encoding='utf-8')
     corpus_paras = corpus.paras()[:]
     k = corpus_paras[::2]
     for i in range(2):
         k = list(chain(*k))
     v = corpus_paras[1::2]
     ngrams_by_topic_from_file = \
         {k.encode('utf-8'): list(set(chain(*v)))
            for k, v in dict(izip(k, v)).items()}
     return ngrams_by_topic_from_file
Example #10
def make_morpho_model(language,
                      model_type,
                      feature,
                      train_file,
                      test_file=None):
    test_file = train_file if test_file is None else test_file

    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()

    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)

    tagger = train_tagger(language, model_type, feature, train_sents)

    acc = tagger.evaluate(test_sents)
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)

    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())

    return (tagger, acc, kappa, cm)
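
verify_tagged_corpus, train_tagger, compute_baseline and conf_matrix are project helpers that are not shown here. As one plausible reading (an assumption, not the project's code), the baseline used for the kappa score could be a most-frequent-tag baseline:

from collections import Counter

def compute_baseline_sketch(tagged_words):
    # accuracy obtained by always predicting the single most common tag
    tags = [tag for _, tag in tagged_words]
    return Counter(tags).most_common(1)[0][1] / len(tags)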
Example #11
def read_reviews():
    """
    read reviews from the given file(s).
    """
    from glob import glob
    filenames = glob("input/food*.parsed")

    sent_end_pattern = r"./[,.]"
    reader = TaggedCorpusReader(root=".",
                                fileids=filenames,
                                sep="/",
                                sent_tokenizer=RegexpTokenizer(
                                    sent_end_pattern, gaps=True))

    li = reader.sents()
    return li
Example #12
    def __init__(self, corpusroot, corpusname):
        # use a custom wordlist corpus via WordListCorpusReader
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        # or via PlaintextCorpusReader
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        #nltk_old = [(3,0,1)]
        #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]

        reader = TaggedCorpusReader(corpusroot, corpusname)

        splitratio = 0.8
   
        self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents())*splitratio)]
        self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents())*splitratio):] 

        print "split test ratio: ", int(len(reader.tagged_sents())*splitratio),"\n"
        print "reader_train len: ", len(self.reader_train)
        print "test_sent len: ", len(self.test_sent)
Example #13
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Example #14
 def generate_corpus_from_segmented_reports(self):
     re = ReportEnviroments()
     new_corpus_of_segmented_reports = TaggedCorpusReader(
         re.segmented_reports_corpus_path,
         '.*',
         sent_tokenizer=LineTokenizer(blanklines='discard'),
         encoding='utf-8')
     raw_segmented_reports = []
     for i in range(len(new_corpus_of_segmented_reports.fileids())):
         raw_segmented_reports.append(
             new_corpus_of_segmented_reports.sents(
                 fileids=new_corpus_of_segmented_reports.fileids()[i]))
     cut_of_segmented_reports = []
     topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
     for i in range(len(raw_segmented_reports)):
         cut_of_segmented_reports.append(
             raw_segmented_reports[i]
             [raw_segmented_reports[i].index([topics[0].decode('utf-8')]):
              raw_segmented_reports[i].index([topics[-1].decode('utf-8')]) +
              1])
     return cut_of_segmented_reports, topics
Example #15
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
Example #16
 def setUp(self):
     reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
     os.system('mkdir -p taggers/oe/pos')
     self.sents = reader.tagged_sents()
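
A hypothetical companion test for the setUp above; the unigram tagger and the 0.9 threshold are assumptions for illustration, not taken from the original suite.

 def test_unigram_tagger_oe(self):
     from nltk.tag import UnigramTagger
     tagger = UnigramTagger(self.sents)
     # self-accuracy on the training sentences should be high
     self.assertGreater(tagger.evaluate(self.sents), 0.9)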
Example #17
import pickle
import random

from nltk import tag
from nltk.tag import brill
from nltk.tag import brill_trainer
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
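
The snippet stops right after computing cutoff. A hedged sketch of how the split and the Brill training could continue from here; the unigram baseline and the stock fntbl37 templates are assumptions, chosen to mirror Example #4.

from nltk.tag import UnigramTagger

training_data = tagged_data_list[:cutoff]
evaluation_data = tagged_data_list[cutoff:development_size]

baseline = UnigramTagger(training_data)
trainer = brill_trainer.BrillTaggerTrainer(baseline, brill.fntbl37(), trace=1)
brill_tagger = trainer.train(training_data, max_rules=max_rules, min_score=min_score)
print(brill_tagger.evaluate(evaluation_data))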
Example #18
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(
        pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [
            item.rstrip() for item in ten_parts[counter] if len(item) > 0
        ]  # or: test_set = part

        if counter == 1:
            print(len(test_set[993]), len(test_set[994]), len(test_set[995]),
                  len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [
            x for x in ten_parts if x is not ten_parts[counter]
        ]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [
            item.rstrip() for sublist in training_set_lists for item in sublist
            if len(item) > 0
        ]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()

        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token
                                            for token, tag in test_sent]))
        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
Example #19
# http://stevenloria.com/how-to-build-a-text-classification-system-with-python-and-textblob/

import nltk
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize

reader = TaggedCorpusReader('.', 'idn.tsv')

txt1 = """Presiden meresmikan kereta api super cepat Jakarta Bandung."""
sent_tokenize(txt1)
print word_tokenize(sent_tokenize(txt1)[0])
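
A hedged continuation: train a simple unigram tagger on idn.tsv and tag the tokenized sentence. This assumes idn.tsv is in TaggedCorpusReader's default word/tag format (sep='/'); if the .tsv is actually tab-separated, pass sep='\t' to the reader instead.

from nltk.tag import UnigramTagger

tagger = UnigramTagger(reader.tagged_sents())
tokens = word_tokenize(sent_tokenize(txt1)[0])
print(tagger.tag(tokens))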

Example #20
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer
global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names',r'.*\.txt',word_tokenizer=PunktWordTokenizer(),sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # the sentences marked with POS tags
tagger = UnigramTagger(name_tags)  # the UnigramTagger is trained on these tagged sentences


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'),
                                     (r'no', 'no_'), (r'not', 'not_'),
                                     (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
Example #21
import os
import sys

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer

filename = sys.argv[1]
without_extension = filename.split('.')
directory_address, base_name = os.path.split(filename)
corpus_reader = TaggedCorpusReader(directory_address or '.', [base_name],
                                   sent_tokenizer=LineTokenizer(),
                                   sep='|')
corpus = corpus_reader.tagged_sents()
new_tags_only = open(
    without_extension[0] + '_tag_sets.' + without_extension[1], 'a+')
count = 1
for each in corpus:
    new_tags_only.write(' '.join('{}'.format(x[1]) for x in each))
    new_tags_only.write('\n')
    print(count)
    count += 1
print(filename + ": tag extraction finished")
new_tags_only.close()
Example #22
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
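
A hypothetical driver for the cross-validation function above; both paths are placeholders.

if __name__ == '__main__':
    results = cltk_pos_cv('greek_training_set.pos', '~/cltk_data/user_data')
    for name, scores in results.items():
        print('%s: mean=%.4f, sd=%.4f' % (name, scores['mean'], scores['sd']))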
Example #23
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from a text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(
    reader.tagged_words(tagset='universal')
)  # map tags to the universal tagset; if the source tagset is wrong, every tag maps to UNK

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
Example #25
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish', 'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 
import pprint 
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
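
A hedged continuation of the feature-extraction idea above: convert the tagged sentences into (features, tag) pairs and train a classifier. NLTK's NaiveBayesClassifier is used here purely for illustration, and the 75/25 split is arbitrary.

from nltk import NaiveBayesClassifier, classify

def to_dataset(tagged_sents):
    data = []
    for tagged in tagged_sents:
        words = [w for w, _ in tagged]
        for index, (_, tag) in enumerate(tagged):
            data.append((features(words, index), tag))
    return data

cutoff = int(0.75 * len(tagged_sentences))
train_set = to_dataset(tagged_sentences[:cutoff])
test_set = to_dataset(tagged_sentences[cutoff:])
classifier = NaiveBayesClassifier.train(train_set)
print('accuracy:', classify.accuracy(classifier, test_set))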
Example #26
 def initClassifier(self):
     self.__corpus__ = TaggedCorpusReader(self.__root__, r'.*\.txt', sep='#')
     self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__,
                                              r'.*\.txt',
                                              sep='#')
Example #27
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #28
"""

from nltk.corpus.reader import TaggedCorpusReader
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.probability import FreqDist
from numpy import mean
# for kfold validation, not working though
# cross-fold validation is just brute forced...
#from sklearn.model_selection import KFold
#import numpy as np


mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project"

EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding = "latin-1")

sentences = EstonianCorpus.tagged_sents()

tags = [tag for _, tag in EstonianCorpus.tagged_words()]
mostFrequent = FreqDist(tags).max()

default = DefaultTagger(mostFrequent)

# cross validation

#kf = KFold(n_splits = 3)
#
## turns the data into a 2d array
#X = np.array(sentences)
## creates a 1d array with same length/number of rows as X
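
The scikit-learn KFold import above is commented out; here is a hedged sketch of the "brute forced" manual cross-validation the comments describe, reusing the DefaultTagger as backoff for a unigram/bigram chain (the choice of chain and k=3 are assumptions):

def manual_kfold(sents, k=3):
    fold_size = len(sents) // k
    scores = []
    for i in range(k):
        test = sents[i * fold_size:(i + 1) * fold_size]
        train = sents[:i * fold_size] + sents[(i + 1) * fold_size:]
        uni = UnigramTagger(train, backoff=default)
        bi = BigramTagger(train, backoff=uni)
        scores.append(bi.evaluate(test))
    return mean(scores)

print(manual_kfold(list(sentences)))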
Example #29
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
Example #30
# # Brill Tagger #

from nltk.wsd import lesk
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import tkinter
from nltk.tag import brill, brill_trainer
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.data import load
from nltk.corpus.reader import TaggedCorpusReader


train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
traindata = list(train_data.tagged_sents())
postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')

templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),