Code example #1
File: kabbot.py  Project: gnespatel1618/KabBot
 def get_brill_tagger(self):
     train_data = TaggedCorpusReader('.',
                                     'tagged_input_sentences.txt',
                                     sep="/")
     traindata = list(train_data.tagged_sents())
     postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
     templates = [
         brill.Template(brill.Pos([-1])),
         brill.Template(brill.Pos([1])),
         brill.Template(brill.Pos([-2])),
         brill.Template(brill.Pos([2])),
         brill.Template(brill.Pos([-2, -1])),
         brill.Template(brill.Pos([1, 2])),
         brill.Template(brill.Pos([-3, -2, -1])),
         brill.Template(brill.Pos([1, 2, 3])),
         brill.Template(brill.Pos([-1]), brill.Pos([1])),
         brill.Template(brill.Word([-1])),
         brill.Template(brill.Word([1])),
         brill.Template(brill.Word([-2])),
         brill.Template(brill.Word([2])),
         brill.Template(brill.Word([-2, -1])),
         brill.Template(brill.Word([1, 2])),
         brill.Template(brill.Word([-3, -2, -1])),
         brill.Template(brill.Word([1, 2, 3])),
         brill.Template(brill.Word([-1]), brill.Word([1]))
     ]
     trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
     brill_tagger = trainer.train(traindata, max_rules=10)
     return brill_tagger
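A minimal usage sketch for the tagger returned above (hedged: `bot` stands for a hypothetical instance of the class defining get_brill_tagger, and the example sentence is illustrative, not from the original project):

# Hypothetical usage of the trained Brill tagger.
brill_tagger = bot.get_brill_tagger()
print(brill_tagger.tag("book a cab to the airport".split()))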
Code example #2
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
Code example #3
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type,
        time.time() - now, path))
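A usage sketch for the function above, assuming the imports this excerpt relies on (os, time, pickle, the nltk taggers) and that greek_training_set.pos plus the ~/greek_models_cltk/taggers/pos directory already exist:

# Train and pickle a backoff tagger, then reload it (illustrative only).
make_pos_model('backoff')
import os, pickle
with open(os.path.expanduser('~/greek_models_cltk/taggers/pos/123grambackoff.pickle'), 'rb') as f:
    tagger = pickle.load(f)
print(tagger.tag(['μῆνιν', 'ἄειδε', 'θεά']))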
Code example #4
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    
    stdout_old = sys.stdout
    
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out'%counter), 'w')  
    
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos'%counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    
    sys.stdout.flush()
    
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)
    
    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    
    sys.stdout = stdout_old
Code example #5
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
Code example #6
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)
Code example #7
    def __init__(self, corpusroot, corpusname):
        #use a custom wordlist corpus with the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        #use a custom wordlist corpus with the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        reader = TaggedCorpusReader(corpusroot, corpusname)
   
        self.reader_train = reader.tagged_sents()
        self.test_sent = reader.tagged_sents()[1000:] 
Code example #8
File: chinesereader.py  Project: fannix/Utils
 def __init__(self, sep="/", 
              # Note that . needs to be escaped
              pattern = chinese_pattern, 
              root=None, fileids=None):
     """docstring for __init__"""
     TaggedCorpusReader.__init__(
         self,
         sep=sep, root=root, fileids=fileids,
         sent_tokenizer = RegexpTokenizer(pattern, gaps=True),
         encoding="utf-8")
Code example #9
    def load_corpus_reviews(self,begin,end):
        #reader = LazyCorpusLoader()
        reader = TaggedCorpusReader('data/', r'.*\.pos')

        pos_fileids = reader.fileids()[1]
        neg_fileids = reader.fileids()[0]

        pos_sents = reader.tagged_sents(pos_fileids)
        neg_sents = reader.tagged_sents(neg_fileids)

        return (pos_sents[begin:end], neg_sents[begin:end])
Code example #10
File: classifier.py  Project: ixtel/wpfc
def list_of_words_for(category,limit=20):
    category_reader = TaggedCorpusReader('corpus',category)

    most_freq_words = []

    for w,t in category_reader.tagged_words():
        if t not in ["PRP","NC","$","$NC"]:
            most_freq_words.append(w.lower())
    pos_counts = collections.Counter(w for w in most_freq_words)
    result = [word for word, count in pos_counts.most_common(limit)]
    return result
Code example #11
 def read(self, file_path):
     logger.info('Reading instances from file %s', file_path)
     reader = TaggedCorpusReader(*os.path.split(file_path),
                                 sep='\t',
                                 word_tokenizer=RegexpTokenizer(r'\n',
                                                                gaps=True),
                                 sent_tokenizer=BlanklineTokenizer(),
                                 para_block_reader=lambda s: [s.read()])
     return Dataset([
         self.text_to_instance(*tuple(zip(*tagged_sent)))
         for tagged_sent in reader.tagged_sents()
     ])
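Given the separator and tokenizer arguments above, this reader expects one token<TAB>tag pair per line, with blank lines between sentences. A self-contained sketch of that layout (file name and tokens are illustrative, not from the original project):

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import RegexpTokenizer, BlanklineTokenizer

sample = "The\tDET\ndog\tNOUN\nbarks\tVERB\n\nIt\tPRON\nsleeps\tVERB\n"
with open('sample.tsv', 'w') as f:
    f.write(sample)

reader = TaggedCorpusReader('.', 'sample.tsv',
                            sep='\t',
                            word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                            sent_tokenizer=BlanklineTokenizer(),
                            para_block_reader=lambda s: [s.read()])
print(reader.tagged_sents())  # should yield two tagged sentences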
Code example #12
File: mte.py  Project: jatanpatel92/nltk
    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusReader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') 

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
Code example #13
 def __init__(self,
              file_name,
              language='',
              separator='_',
              ws_delim=True,
              number_of_groups=10,
              encoding='utf-8'):
     """Initialize the corpus reader."""
     TaggedCorpusReader.__init__(self,
                                 root='.',
                                 fileids=[file_name],
                                 sep=separator,
                                 encoding=encoding)
Code example #14
File: mte.py  Project: Copper-Head/nltk
    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusReader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)
Code example #15
 def generate_corpus_from_segmented_reports(self):
     re = ReportEnviroments()
     new_corpus_of_segmented_reports = TaggedCorpusReader(re.segmented_reports_corpus_path, '.*',
                                                          sent_tokenizer=LineTokenizer(blanklines='discard'),
                                                          encoding='utf-8')
     raw_segmented_reports = []
     for i in range(len(new_corpus_of_segmented_reports.fileids())):
         raw_segmented_reports.append(new_corpus_of_segmented_reports.sents(fileids=new_corpus_of_segmented_reports.fileids()[i]))
     cut_of_segmented_reports = []
     topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
     for i in range(len(raw_segmented_reports)):
         cut_of_segmented_reports.append(raw_segmented_reports[i][raw_segmented_reports[i].index([topics[0].decode('utf-8')]):raw_segmented_reports[i].index([topics[-1].decode('utf-8')])+1])    
     return cut_of_segmented_reports, topics
Code example #16
File: chinesereader.py  Project: fannix/Utils
 def __init__(
         self,
         sep="/",
         # Note that . needs to be escaped
         pattern=chinese_pattern,
         root=None,
         fileids=None):
     """docstring for __init__"""
     TaggedCorpusReader.__init__(self,
                                 sep=sep,
                                 root=root,
                                 fileids=fileids,
                                 sent_tokenizer=RegexpTokenizer(pattern,
                                                                gaps=True),
                                 encoding="utf-8")
Code example #17
 def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
     corpus = \
         TaggedCorpusReader(ngram_directory,
                            ngram_file,
                            sent_tokenizer=LineTokenizer(blanklines='discard'),
                            encoding='utf-8')
     corpus_paras = corpus.paras()[:]
     k = corpus_paras[::2]
     for i in range(2):
         k = list(chain(*k))
     v = corpus_paras[1::2]
     ngrams_by_topic_from_file = \
         {k.encode('utf-8'): list(set(chain(*v)))
            for k, v in dict(izip(k, v)).items()}
     return ngrams_by_topic_from_file
Code example #18
def read_reviews():
    """
    read reviews from the given file(s).
    """
    from glob import glob
    filenames = glob("input/food*.parsed")
    
    sent_end_pattern = ".\/[,\.]"
    reader = TaggedCorpusReader(
        root = ".",
        fileids = filenames,
        sep = "/",
        sent_tokenizer = RegexpTokenizer(sent_end_pattern, gaps=True))

    li = reader.sents()
    return li
Code example #19
def read_reviews():
    """
    read reviews from the given file(s).
    """
    from glob import glob
    filenames = glob("input/food*.parsed")

    sent_end_pattern = ".\/[,\.]"
    reader = TaggedCorpusReader(root=".",
                                fileids=filenames,
                                sep="/",
                                sent_tokenizer=RegexpTokenizer(
                                    sent_end_pattern, gaps=True))

    li = reader.sents()
    return li
Code example #20
class CorpusAnalysis():
    def __init__(self):
        self.punctuation = ['.',',',';','!','?','_','"','&',"'"]
        self.load()
        
    def load(self):
        from nltk.corpus.reader import TaggedCorpusReader
        from nltk.tokenize import WordPunctTokenizer
        self.reader = TaggedCorpusReader('../data/', r'.*\.pos')

    def words(self):
        print self.reader.words(['rev_pos.pos'])

    def ngrams(self,words,n=0):
        
        from nltk.corpus import stopwords
        word_list2 = [w for w in words if w not in stopwords.words('english') and w not in self.punctuation]

        wprev,wprev1,wprev2 = None,None,None
        for i in range(len(word_list2)):
            w = word_list2[i] 
            yield (wprev,wprev1,wprev2,w)
            wprev = wprev1
            wprev1 = wprev2
            wprev2 = w

    def freq_dist_words(self):
        from nltk import ConditionalFreqDist
        from nltk.model import NgramModel
        categories = ['rev_neg.pos','rev_pos.pos']
        cfd = ConditionalFreqDist((category, word) for category in categories for word in self.ngrams(self.reader.words(category)))
        genres = ['rev_neg.pos', 'rev_pos.pos']
        modals = ['location','room','size','staff','excellent','poor','good','bad']

        print 'neg :', cfd.__getitem__('rev_neg.pos')       
        print 'pos :', cfd.__getitem__('rev_pos.pos')
        #lm = NgramModel(4, self.reader.words(['rev_neg.pos']))

    def freq_dist_tags(self):
        from nltk import ConditionalFreqDist
        from nltk.model import NgramModel        
        cfd = ConditionalFreqDist((tag, word) for (word, tag) in self.reader.tagged_words(self.cat_pos) if word.isalpha())
        
        return cfd

    def MI(self):
        pass
Code example #21
    def take_ngrams_by_topic_from_file(self, 
		                               ngram_directory, 
		                               ngram_file):
        corpus = \
            TaggedCorpusReader(ngram_directory, 
                               ngram_file, 
                               sent_tokenizer=LineTokenizer(blanklines='discard'), 
                               encoding='utf-8')
        corpus_paras = corpus.paras()[:]
        k = corpus_paras[::2]
        for i in range(2):
            k = list(chain(*k))
        v = corpus_paras[1::2]
        ngrams_by_topic_from_file = \
            {k.encode('utf-8'): list(set(chain(*v))) 
               for k, v in dict(izip(k, v)).items()}
        return ngrams_by_topic_from_file
Code example #22
def read_sentences_corpus(reader = None):
	#reader = LazyCorpusLoader()
	#this overrides the reader argument passed in
	reader = TaggedCorpusReader('../data/', r'.*\.pos')
	'''
	Create a corpus reader over the files in ../data/*.pos.
	These files contain tagged sentences and form the basis of the training and test sets.
	'''

	pos_fileids = reader.fileids()[1]
	neg_fileids = reader.fileids()[0]

	pos_sents = reader.tagged_sents(pos_fileids)
	neg_sents = reader.tagged_sents(neg_fileids)

	#pos_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in pos_sents ]
	#neg_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in neg_sents ]

	return (pos_sents,neg_sents)
Code example #23
class CorpusParser:
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.

        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.

        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.

        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.

        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
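A minimal usage sketch for the wrapper above (the directory name is hypothetical; it is assumed to contain the tagged files to read):

# Illustrative driver, not part of the original project.
parser = CorpusParser('corpus_dir')
print(parser.words()[:10])
print(parser.tagged_sentences()[0])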
Code example #24
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Code example #25
File: Treebank.py  Project: rickstello/hmm-tagger
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Code example #26
 def generate_corpus_from_segmented_reports(self):
     re = ReportEnviroments()
     new_corpus_of_segmented_reports = TaggedCorpusReader(
         re.segmented_reports_corpus_path,
         '.*',
         sent_tokenizer=LineTokenizer(blanklines='discard'),
         encoding='utf-8')
     raw_segmented_reports = []
     for i in range(len(new_corpus_of_segmented_reports.fileids())):
         raw_segmented_reports.append(
             new_corpus_of_segmented_reports.sents(
                 fileids=new_corpus_of_segmented_reports.fileids()[i]))
     cut_of_segmented_reports = []
     topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
     for i in range(len(raw_segmented_reports)):
         cut_of_segmented_reports.append(
             raw_segmented_reports[i]
             [raw_segmented_reports[i].index([topics[0].decode('utf-8')]):
              raw_segmented_reports[i].index([topics[-1].decode('utf-8')]) +
              1])
     return cut_of_segmented_reports, topics
Code example #27
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
Code example #28
File: hmm_train.py  Project: rrichajalota/ws-20-21
def main():
    """main function
    """
    n = 2  # Bigram HMM
    args = parse_arguments()
    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0],
        os.path.split(args.train_f)[1])
    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags

    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation- initial, transition and emission probablity
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words,
                               tags,
                               state_space,
                               treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Test your HMM-trained model
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0],
        os.path.split(args.eval_f)[1])
    viterbi_tags = []

    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p,
                                        trans_p, emission_p)
        # Computes Viterbi's most likely tags

        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()

    print("Runtime (viterbi): %.3f s" % (end - start))
    output_path = "./" + "de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
Code example #29
def make_morpho_model(language,
                      model_type,
                      feature,
                      train_file,
                      test_file=None):
    test_file = train_file if test_file is None else test_file

    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()

    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)

    tagger = train_tagger(language, model_type, feature, train_sents)

    acc = tagger.evaluate(test_sents)
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)

    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())

    return (tagger, acc, kappa, cm)
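For reference, the kappa line above rescales raw accuracy against the baseline returned by compute_baseline (not shown in this excerpt): with illustrative numbers acc = 0.90 and baseline = 0.60, kappa = (0.90 - 0.60) / (1 - 0.60) = 0.75.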
Code example #30
    def __init__(self, corpusroot, corpusname):
        #use a custom wordlist corpus with the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        #use a custom wordlist corpus with the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        #nltk_old = [(3,0,1)]
        #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]

        reader = TaggedCorpusReader(corpusroot, corpusname)

        splitratio = 0.8
   
        self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents())*splitratio)]
        self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents())*splitratio):] 

        print "split test ratio: ", int(len(reader.tagged_sents())*splitratio),"\n"
        print "reader_train len: ", len(self.reader_train)
        print "test_sent len: ", len(self.test_sent)
Code example #31
File: sequentigram.py  Project: drr3d/BimaNLP
    def __init__(self, corpusroot, corpusname):
        #use a custom wordlist corpus with the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        #use a custom wordlist corpus with the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        #nltk_old = [(3,0,1)]
        #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]

        reader = TaggedCorpusReader(corpusroot, corpusname)

        splitratio = 0.8
   
        self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents())*splitratio)]
        self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents())*splitratio):] 

        print "split test ratio: ", int(len(reader.tagged_sents())*splitratio),"\n"
        print "reader_train len: ", len(self.reader_train)
        print "test_sent len: ", len(self.test_sent)
Code example #32
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__,
                                                 '.*\.txt',
                                                 sep='#')

    def separateSentence(self):
        grammer = r"""
        NP:
            {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|.) in this paragraph 
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)

    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(
                        list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non testing data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
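A hypothetical driver for the Classifier above (directory names and key words are illustrative; the corpora are expected to hold '#'-separated word#tag files, as configured in initClassifier):

clf = Classifier('train_root', ['keyword1', 'keyword2'], 'dev_root')
clf.initClassifier()
clf.training()
print(clf.testClassifierAccuracy())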
Code example #33
File: CustomCorpora.py  Project: AbhideepRND/NLTK
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Code example #34
File: train_oe.py  Project: lrosenb2/OldEnglish
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish', 'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 
import pprint 
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
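A minimal sketch of feeding the tagged sentences and the features() function above into nltk.NaiveBayesClassifier (the untag helper and the in-memory loop are illustrative, not from the original script):

import nltk

def untag(tagged_sentence):
    """Drop the tags, keeping only the words."""
    return [w for w, t in tagged_sentence]

train_data = []
for tagged in tagged_sentences:
    words = untag(tagged)
    for index, (word, tag) in enumerate(tagged):
        train_data.append((features(words, index), tag))

classifier = nltk.NaiveBayesClassifier.train(train_data)
print(classifier.classify(features(['This', 'is', 'a', 'sentence'], 2)))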
Code example #35
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
Code example #36
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
Code example #37
########## TAGGED CORPUS READER ###############

from nltk.corpus.reader import TaggedCorpusReader
root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file="brown.pos"
source=root+file

#Using Regex to match all files with extension .pos
reader=TaggedCorpusReader(root,r'.*\.pos')

print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()


#TaggedCorpusReader uses a default word tokenizer, but we can supply a custom one
from nltk.tokenize import SpaceTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',word_tokenizer=SpaceTokenizer())
print reader.words()

#Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',sent_tokenizer=LineTokenizer())
print reader.words()

#Customizing TaggedCorpusReader's paragraph block reader
#Customizing TaggedCorpusReader's tag separator - Pg 57 (both sketched below)
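#A sketch completing the two customizations named above, kept in this example's
#Python 2 style. read_line_block is a real helper in nltk.corpus.reader.util;
#sep='|' assumes tokens written as word|TAG, which is an assumption about the
#data rather than something shown in this file.
from nltk.corpus.reader.util import read_line_block
reader=TaggedCorpusReader(root,r'.*\.pos',para_block_reader=read_line_block)
print reader.paras()
reader=TaggedCorpusReader(root,r'.*\.pos',sep='|')
print reader.tagged_words()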
Code example #38
 def load(self):
     from nltk.corpus.reader import TaggedCorpusReader
     from nltk.tokenize import WordPunctTokenizer
     self.reader = TaggedCorpusReader('../data/', r'.*\.pos')
Code example #39
File: Extractor.py  Project: JongleurX/Terminology
def trainPOSTagger(useTnTTagger):
	global __debug_on__
	global pos_tagger
	global adskCorpusRoot
	# Train TNT/Brill POS-tagger using own training data + treebank data from nltk. Tested that using treebank data improves results.

	autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
	train_sents =  autodesk.tagged_sents() + treebank.tagged_sents()

	# Use TnT tagger on request
	if useTnTTagger:
		if __debug_on__:
			Service.logger.debug("Using TnT POS tagger...")
		unk_tagger = DefaultTagger('NN')

		pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
		pos_tagger.train(train_sents)
	# Use Brill tagger by default
	else:
		if __debug_on__:
			Service.logger.debug("Using Brill POS tagger...")

		def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
			if not backoff:
				backoff = tagger_classes[0](tagged_sents)
				del tagger_classes[0]
 
			for cls in tagger_classes:
				tagger = cls(tagged_sents, backoff=backoff)
				backoff = tagger
 
			return backoff
	
		word_patterns = [
			(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
			(r'.*ould$', 'MD'),
			(r'.*ing$', 'VBG'),
			(r'.*ed$', 'VBD'),
			(r'.*ness$', 'NN'),
			(r'.*ment$', 'NN'),
			(r'.*ful$', 'JJ'),
			(r'.*ious$', 'JJ'),
			(r'.*ble$', 'JJ'),
			(r'.*ic$', 'JJ'),
			(r'.*ive$', 'JJ'),
			(r'.*ic$', 'JJ'),
			(r'.*est$', 'JJ'),
			(r'^a$', 'PREP'),
		]
		raubt_tagger = backoff_tagger(train_sents, [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger], backoff=nltk.tag.RegexpTagger(word_patterns))
 
		templates = [
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
			brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
			brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
		]
	 
		trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
		pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
Code example #40
# http://stevenloria.com/how-to-build-a-text-classification-system-with-python-and-textblob/

import nltk
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize

reader = TaggedCorpusReader('.', 'idn.tsv')

txt1 = """Presiden meresmikan kereta api super cepat Jakarta Bandung."""
sent_tokenize(txt1)
print word_tokenize(sent_tokenize(txt1)[0])

Code example #41
File: pos_lapos_cv.py  Project: wencanluo/cltk_pos
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        
        if counter==1:
            print(len(test_set[993]),len(test_set[994]),len(test_set[995]),len(test_set[996]))
    
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists for item in sublist if len(item) > 0]
        
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos'%counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))
        
        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
        test_sents = test_reader.tagged_sents()
        
        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token,tag in test_sent]))
        test_text_path = os.path.join(local_dir, 'test_%d.txt'%counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))
        
        test_path = os.path.join(local_dir, 'test_%d.pos'%counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos'%counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
Code example #42
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from a text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(
    reader.tagged_words(tagset='universal')
)  ## Maps tags to the universal tagset; if the tagset is wrong, every tag becomes UNK

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
Code example #43
File: pos_corpus.py  Project: anderscui/nlpy
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
Code example #44
File: files.py  Project: menzenski/Razmetka
 def __init__(self, file_name, language="", separator="_", ws_delim=True, number_of_groups=10, encoding="utf-8"):
     """Initialize the corpus reader."""
     TaggedCorpusReader.__init__(self, root=".", fileids=[file_name], sep=separator, encoding=encoding)
Code example #45
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer
global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names',r'.*\.txt',word_tokenizer=PunktWordTokenizer(),sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # Receives the sentences tagged with POS tags.
tagger = UnigramTagger(name_tags)  # The UnigramTagger is trained on these tagged sentences.


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'),
                                     (r'no', 'no_'), (r'not', 'not_'),
                                     (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
Code example #46
 def initClassifier(self):
     self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
     self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__,
                                              '.*\.txt',
                                              sep='#')
Code example #47
File: training_tagger.py  Project: bugraoral/TextRank
from nltk.tag import brill
from nltk.tag import brill_trainer
import pickle


# Brill tagger parameters
max_rules=300
min_score=3

# Training parameters
development_size=5110
train=.85


# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|', sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()


# Lower words and return as a list
tagged_data_list  = [[t for t in sent] for sent in tagged_data] 
tagged_data_list = [[(w.lower(),t) for (w,t) in s] for s in tagged_data_list]

## print "Data is read! " 

# Randomize training and evaluation set
random.seed(len(tagged_data_list)) 
random.shuffle(tagged_data_list) 
cutoff = int(development_size*train)
Code example #48
File: training_tagger.py  Project: Temerrut/TextRank
from nltk import tag
from nltk.tag import brill
from nltk.tag import brill_trainer
import pickle

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
Code example #49
File: NaiveBayesForNLP.py  Project: richzw/CodeHome
 def initClassifier(self):
     self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep = '#')
     self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep = '#')
Code example #50
File: NaiveBayesForNLP.py  Project: richzw/CodeHome
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot
        
    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep = '#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep = '#')
    
    def separateSentence(self):
        grammer = r"""
        NP:
            {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|.) in this paragraph 
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)
        
    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val
    
    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue      # skip the non training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree)) # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)
    
    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features
        
    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)
    
    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue      # skip the non testing data            
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
Code example #51
 def setUp(self):
     reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
     os.system('mkdir -p taggers/oe/pos')
     self.sents = reader.tagged_sents()
Code example #52
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
Code example #53
File: tagger.py  Project: ixtel/wpfc
import nltk
from nltk.tag import RegexpTagger
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('corpus','tagged_corpus')
train = reader.tagged_sents()

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train,backoff=tagger0)
tagger2 = nltk.BigramTagger(train,backoff=tagger1)
patterns = [
    (r'^\d+((.|,)\d+)?\.?$', 'NC'),
    (r'^.*\$$','$'),
    (r'R\$\d+((.|,)\d+)?\.?$','NC$'),
    (r'^(R|r)eais$','$'),
    (r'^(D|d)(o|ó)lares','$')
]
tagger3 = RegexpTagger(patterns,backoff=tagger2)

def tag(sent):
    result = tagger3.tag(sent.split())

    return result
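A minimal usage sketch for the backoff chain above (the sentence is illustrative; the actual tags depend on the tagged_corpus training data, which is not shown here):

print(tag('O quarto custa 200 reais'))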
Code example #54
	if useTnTTagger:
		storedModel = "/var/log/Terminology/pos_model_tnt.bin"
	else:
		storedModel = "/var/log/Terminology/pos_model_brill.bin"

	if os.path.isfile(storedModel):
		Service.logger.debug("Loading stored POS tagger model from %s" % storedModel)
		modelFile = open(storedModel, "rb")
		try:
			pos_tagger = cPickle.load(modelFile)
		except Exception, e:
			Service.logger.debug("Exception while loading pickled POS model!")
			Service.logger.debug(Service.traceback.format_exc())
		modelFile.close()
	else:
		autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
		train_sents =  autodesk.tagged_sents() + treebank.tagged_sents()
	
		# Use TnT tagger on request
		if useTnTTagger:
			if __debug_on__:
				Service.logger.debug("Using TnT POS tagger...")
			unk_tagger = DefaultTagger('NN')
	
			pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
			pos_tagger.train(train_sents)
		# Use Brill tagger by default
		else:
			if __debug_on__:
				Service.logger.debug("Using Brill POS tagger...")