def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    # redirect stdout to a per-fold log file
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()
    print('Loop #' + str(counter))
    sys.stdout.flush()
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)
    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    # restore stdout
    sys.stdout = stdout_old
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    reader = TaggedCorpusReader(corpusroot, corpusname)
    # note: the test slice overlaps the training data here
    self.reader_train = reader.tagged_sents()
    self.test_sent = reader.tagged_sents()[1000:]
def get_brill_tagger(self):
    train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
    traindata = list(train_data.tagged_sents())
    postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    # contextual templates: surrounding POS tags and words at various offsets
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
    brill_tagger = trainer.train(traindata, max_rules=10)
    return brill_tagger
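# A minimal usage sketch for get_brill_tagger() above. The enclosing class
# name `TaggerFactory` is hypothetical; the call only assumes that
# tagged_input_sentences.txt and the pickled treebank tagger are available.
brill_tagger = TaggerFactory().get_brill_tagger()
print(brill_tagger.tag('The quick brown fox jumps over the lazy dog'.split()))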
def make_pos_model(model_type):
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
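# Illustrative driver for make_pos_model(); assumes greek_training_set.pos is
# in the working directory and ~/greek_models_cltk/taggers/pos exists.
for model_type in ('unigram', 'bigram', 'trigram', 'backoff', 'tnt'):
    make_pos_model(model_type)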
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    # hold out the first 30% of sentences for testing
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    #nltk_old = [(3, 0, 1)]
    #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]
    reader = TaggedCorpusReader(corpusroot, corpusname)
    # 80/20 train/test split
    splitratio = 0.8
    self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents()) * splitratio)]
    self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents()) * splitratio):]
    print "split test ratio: ", int(len(reader.tagged_sents()) * splitratio), "\n"
    print "reader_train len: ", len(self.reader_train)
    print "test_sent len: ", len(self.test_sent)
def load_corpus_reviews(self, begin, end):
    #reader = LazyCorpusLoader()
    reader = TaggedCorpusReader('data/', r'.*\.pos')
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    return (pos_sents[begin:end], neg_sents[begin:end])
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    # one token per line, tab-separated word/tag, blank line between sentences
    reader = TaggedCorpusReader(*os.path.split(file_path), sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])
    return Dataset([self.text_to_instance(*tuple(zip(*tagged_sent)))
                    for tagged_sent in reader.tagged_sents()])
def make_morpho_model(language, model_type, feature, train_file, test_file=None):
    test_file = train_file if test_file is None else test_file
    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()
    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)
    tagger = train_tagger(language, model_type, feature, train_sents)
    acc = tagger.evaluate(test_sents)
    # kappa: accuracy normalized against a chance baseline
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)
    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())
    return (tagger, acc, kappa, cm)
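# compute_baseline() is not defined in this snippet; a minimal sketch under
# the assumption that the baseline used for kappa is majority-tag accuracy.
from collections import Counter

def compute_baseline(tagged_words):
    # accuracy of always guessing the corpus's single most frequent tag
    tags = [tag for _, tag in tagged_words]
    return Counter(tags).most_common(1)[0][1] / len(tags)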
def read_sentences_corpus(reader=None):
    #reader = LazyCorpusLoader()  # would be overridden below anyway
    # create a corpus reader over the files in ../data/*.pos; these files
    # contain tagged sentences and are the basis of the training/test sets
    reader = TaggedCorpusReader('../data/', r'.*\.pos')
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    #pos_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in pos_sents]
    #neg_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in neg_sents]
    return (pos_sents, neg_sents)
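# Example call, assuming ../data/ holds exactly the two .pos files the
# function expects (negative reviews first, positive second in fileids()).
pos_sents, neg_sents = read_sentences_corpus()
print(len(pos_sents), len(neg_sents))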
class CorpusParser:
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.
        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.
        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.
        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.
        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.
        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
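# Illustrative usage of CorpusParser; the corpus directory and file pattern
# are assumptions, not part of the original snippet.
parser = CorpusParser('corpora/my_tagged_corpus', r'.*\.pos')
print(parser.tagged_words()[:10])
print(len(parser.tagged_sentences()))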
def __init__(self, corpus_path, corpus_files):
    """
    Construct a Treebank object

    :param corpus_path: path to corpus files
    :param corpus_files: list of filenames for corpus text
    """
    msg("Importing treebank...")
    # get a corpus reader object for our corpus using NLTK
    treebank = TaggedCorpusReader(corpus_path, corpus_files)
    # get all sentences from corpus in a tagged format
    self.tagged_sents = treebank.tagged_sents()
    # get all sentences from corpus in an untagged format
    self.sents = treebank.sents()
    msg("done!\n")
# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
## print "Data is read!"

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

# Training set
training_data = tagged_data_list[:cutoff]
def setUp(self):
    reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
    os.system('mkdir -p taggers/oe/pos')
    self.sents = reader.tagged_sents()
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)
    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        if counter == 1:
            print(len(test_set[993]), len(test_set[994]), len(test_set[995]), len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists for item in sublist if len(item) > 0]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        # also write a plain-text version of the test sentences
        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()
        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token, tag in test_sent]))

        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
storedModel = "/var/log/Terminology/pos_model_tnt.bin" else: storedModel = "/var/log/Terminology/pos_model_brill.bin" if os.path.isfile(storedModel): Service.logger.debug("Loading stored POS tagger model from %s" % storedModel) modelFile = open(storedModel, "rb") try: pos_tagger = cPickle.load(modelFile) except Exception, e: Servide.logger.debug("Exception while loading pickled POS model!") Service.logger.debug(Service.traceback.format_exc()) modelFile.close() else: autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8') train_sents = autodesk.tagged_sents() + treebank.tagged_sents() # Use TnT tagger on request if useTnTTagger: if __debug_on__: Service.logger.debug("Using TnT POS tagger...") unk_tagger = DefaultTagger('NN') pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True) pos_tagger.train(train_sents) # Use Brill tagger by default else: if __debug_on__: Service.logger.debug("Using Brill POS tagger...") def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
def trainPOSTagger(useTnTTagger):
    global __debug_on__
    global pos_tagger
    global adskCorpusRoot
    # Train TnT/Brill POS tagger using own training data + treebank data from
    # NLTK. Tested that using treebank data improves results.
    autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
    train_sents = autodesk.tagged_sents() + treebank.tagged_sents()
    # Use TnT tagger on request
    if useTnTTagger:
        if __debug_on__:
            Service.logger.debug("Using TnT POS tagger...")
        unk_tagger = DefaultTagger('NN')
        pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
        pos_tagger.train(train_sents)
    # Use Brill tagger by default
    else:
        if __debug_on__:
            Service.logger.debug("Using Brill POS tagger...")

        def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
            if not backoff:
                backoff = tagger_classes[0](tagged_sents)
                del tagger_classes[0]
            for cls in tagger_classes:
                tagger = cls(tagged_sents, backoff=backoff)
                backoff = tagger
            return backoff

        word_patterns = [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*ould$', 'MD'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*ness$', 'NN'),
            (r'.*ment$', 'NN'),
            (r'.*ful$', 'JJ'),
            (r'.*ious$', 'JJ'),
            (r'.*ble$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*ive$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*est$', 'JJ'),
            (r'^a$', 'PREP'),
        ]

        raubt_tagger = backoff_tagger(
            train_sents,
            [nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
             nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
            backoff=nltk.tag.RegexpTagger(word_patterns))

        # templates for the pre-NLTK-3 brill trainer API
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
        ]
        trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
        pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, r'.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, r'.*\.txt', sep='#')

    def separateSentence(self):
        # chunk everything into NP, then chink on punctuation (PU)
        grammer = r"""
        NP: {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|。) in this paragraph
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)

    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-testing data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
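# Hypothetical driver for the Classifier above; the directory layout and
# keyword list are assumptions (the corpus is '#'-separated tagged text with
# PU punctuation tags, so keywords would be corpus-specific terms).
clf = Classifier('data/train', ['keyword1', 'keyword2'], 'data/dev')
clf.initClassifier()
clf.training()
print(clf.testClassifierAccuracy())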
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish',
                            'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))


def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
    }


import pprint
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make and evaluate unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make and evaluate tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
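# For reference, TaggedCorpusReader's default separator is '/', so a
# compatible cookbook .pos file would hold lines of word/tag pairs such as:
#
#   The/AT expense/NN and/CC time/NN involved/VBN are/BER astronomical/JJ ./.
#
# (sample line shown for illustration; the actual corpus file is assumed)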
# # Brill Tagger

# In[11]:

from nltk.wsd import lesk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import tkinter
from nltk.tag import brill, brill_trainer
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.data import load
from nltk.corpus.reader import TaggedCorpusReader

train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
traindata = list(train_data.tagged_sents())
postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),
    brill.Template(brill.Word([-2])),
    brill.Template(brill.Word([2])),
    brill.Template(brill.Word([-2, -1])),
    brill.Template(brill.Word([1, 2])),
    brill.Template(brill.Word([-3, -2, -1])),
    brill.Template(brill.Word([1, 2, 3])),
    brill.Template(brill.Word([-1]), brill.Word([1])),
]
trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
brill_tagger = trainer.train(traindata, max_rules=10)
########## TAGGED CORPUS READER ###############
from nltk.corpus.reader import TaggedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file = "brown.pos"
source = root + file

# Using a regex to match all files with extension .pos
reader = TaggedCorpusReader(root, r'.*\.pos')
print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()

# TaggedCorpusReader uses a default word tokenizer, but we can customize it
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57
import sys
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer

filename = sys.argv[1]
without_extension = filename.split('.')
file_address = filename.split('/')
directory = file_address[:-1]
directory_address = '/'.join('{}'.format(x) for x in directory) + '/'

# one sentence per line, '|'-separated word/tag pairs
corpus_reader = TaggedCorpusReader(directory_address, [filename],
                                   sent_tokenizer=LineTokenizer(), sep='|')
corpus = corpus_reader.tagged_sents()

new_tags_only = open(without_extension[0] + '_tag_sets.' + without_extension[1], 'a+')
count = 1
for each in corpus:
    new_tags_only.write(' '.join('{}'.format(x[1]) for x in each))
    new_tags_only.write('\n')
    print(count)
    count += 1

print(without_extension[1] + "Tag extracting finished")
new_tags_only.close()
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt', word_tokenizer=PunktWordTokenizer(), sep="_")  # path on Linux
corpus = TaggedCorpusReader('C:/Users/jose.adail/workspace/TextProcessor/names',
                            r'.*\.txt', word_tokenizer=WhitespaceTokenizer(), sep="_")
name_tags = corpus.tagged_sents()  # receives the sentences marked with POS tags
tagger = UnigramTagger(name_tags)  # the UnigramTagger is trained on these tagged sentences


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'), (r'no', 'no_'),
                                     (r'not', 'not_'), (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
                                     (r'e[e]+', 'eqe'), (r'o[o]+', 'oqo'),
                                     (r'tt', 'tqt'), (r'ff', 'fqf'),
                                     (r'dd', 'dqd'), (r'mm', 'mqm'),
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)
    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)
        #if counter > 0: break

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
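# Illustrative call, assuming a blank-line-separated .pos training file and a
# writable working directory (both names are placeholders).
results = cltk_pos_cv('full_training_set.pos', '~/cltk_data/user_data')
print(results)  # e.g. {'crf': {'mean': ..., 'sd': ...}}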
import nltk
from nltk.tag import RegexpTagger
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('corpus', 'tagged_corpus')
train = reader.tagged_sents()

# backoff chain: regexp -> bigram -> unigram -> default
tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)

patterns = [
    (r'^\d+((.|,)\d+)?\.?$', 'NC'),
    (r'^.*\$$', '$'),
    (r'R\$\d+((.|,)\d+)?\.?$', 'NC$'),
    (r'^(R|r)eais$', '$'),
    (r'^(D|d)(o|ó)lares', '$'),
]
tagger3 = RegexpTagger(patterns, backoff=tagger2)


def tag(sent):
    result = tagger3.tag(sent.split())
    return result
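# Example call; the sentence is illustrative and the resulting tags depend
# entirely on the training corpus in corpus/tagged_corpus.
print(tag('O livro custa 25,90 reais'))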
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())
print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
from nltk.corpus.reader import TaggedCorpusReader
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.probability import FreqDist
from numpy import mean

# for k-fold validation, not working though
# cross-fold validation is just brute forced...
#from sklearn.model_selection import KFold
#import numpy as np

mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project"
EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding="latin-1")
sentences = EstonianCorpus.tagged_sents()

# baseline: tag everything with the most frequent tag in the corpus
tags = [tag for _, tag in EstonianCorpus.tagged_words()]
mostFrequent = FreqDist(tags).max()
default = DefaultTagger(mostFrequent)

# cross validation
#kf = KFold(n_splits=3)
#
## turns the data into a 2d array
#X = np.array(sentences)
## creates a 1d array with same length/number of rows as X
#y = np.arange(0, len(sentences), 1)
#
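# A minimal hand-rolled 3-fold cross validation, sketching what the
# commented-out KFold code above appears to intend; the choice of a unigram
# tagger backed off to `default` is an assumption.
fold = len(sentences) // 3
scores = []
for i in range(3):
    test = sentences[i * fold:(i + 1) * fold]
    train = sentences[:i * fold] + sentences[(i + 1) * fold:]
    tagger = UnigramTagger(train, backoff=default)
    scores.append(tagger.evaluate(test))
print(mean(scores))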
def NER_HINDI():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    # hold out the first 30% of sentences for testing
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
    test = hmm_tagger.test(test_sents)
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = hmm_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
    # INPUT FROM FILE: persist and reload the trained tagger with dill
    with open('HINDIHMMNER1.dill', 'wb') as f:
        dill.dump(hmm_tagger, f)
    with open('HINDIHMMNER1.dill', 'rb') as f:
        hmm_tagger1 = dill.load(f)
    test_tags = [tag for sent in reader.sents()
                 for (word, tag) in hmm_tagger1.tag(sent)]
    gold_tags = [tag for (word, tag) in reader.tagged_words()]
    ltesttag = len(test_tags)
    lgtags = len(gold_tags)
    print "Test Tag Len:", ltesttag
    print "Gold Tag Len:", lgtags
    cm = nltk.ConfusionMatrix(gold_tags, test_tags)
    print(cm.pretty_format(sort_by_count=True, show_percents=False, truncate=5))
    labels = set('NA GPE PERS DATE ORG'.split())  # THE TAG SETS AS GENERATED IN CONFUSION MATRIX
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]
    print "TP:", sum(true_positives.values()), true_positives
    print "FN:", sum(false_negatives.values()), false_negatives
    print "FP:", sum(false_positives.values()), false_positives
    print
    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
        else:
            precision = true_positives[i] / float(true_positives[i] + false_positives[i])
            recall = true_positives[i] / float(true_positives[i] + false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
        fscore1 = fscore * 100
        print "TAG:", i, "FMEASURE:", fscore1