def ARPosTag(self, List):
    """Tag Arabic words with a rule-based (regular expression) POS tagger.

    Each word has its diacritics removed with ``araby.strip_tashkeel`` and
    the first suffix from ``self.s2`` that it ends with is cut off.  The
    resulting words are then tagged with an ``RegexpTagger`` built from the
    pattern table below.

    :param List: iterable of word strings to tag.
    :return: whatever ``RegexpTagger.tag`` returns for the normalised words
        (the tags are the Arabic labels from the pattern table).
    """
    # Patterns are listed from most specific (closed word classes) to most
    # general; the final empty pattern matches everything and acts as the
    # default tag.
    patterns = [
        ('^(الله|لله|ربنا|رب|إله)$','لفظ جلالة'),
        ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$','حرف'),
        ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
        ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
        ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
        ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
        # NOTE(review): these look like second-person pronouns while the tag
        # reads "first-person pronoun" -- confirm the intended label.
        ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
        ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$','كان وأخواتها'),
        ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$','إن وأخواتها'),
        ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
        ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
        ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
        ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
        ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق','صفة'),
        #('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$','صفة مشبهه باسم فاعل'),
        ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$','اسم مفعول'),
        ('^(م[ا-ي]{3})$','اسمي الزمان والمكان'),
        ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$','فعل'),
        ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
        ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
        ('','اسم'),
    ]
    reg = RegexpTagger(patterns)

    normalised = []
    for word in List:
        bare = araby.strip_tashkeel(word)
        # Cut off the first matching suffix only.  The previous loop reset the
        # result on every later non-matching suffix, so in practice only the
        # last entry of self.s2 was ever stripped, and an empty self.s2
        # replaced every word with ''.
        # presumably self.s2 holds two-letter suffixes (the old code always
        # removed 2 characters); len(suffix) is equivalent in that case.
        for suffix in self.s2:
            if suffix and bare.endswith(suffix):
                bare = bare[:-len(suffix)]
                break
        normalised.append(bare)
    return reg.tag(normalised)
class MaltParser(ParserI):
    """
    Dependency parser that drives the external C{malt.jar} through C{java}.

    A model has to be trained with L{train} or L{train_from_file} before any
    of the parse methods can be used.
    """

    def __init__(self, tagger=None):
        """
        Locate the malt jar and set up the tagger used by L{parse}.

        :param tagger: object with a C{tag(list of words)} method.  When
            omitted, a small English C{RegexpTagger} is used.
        """
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
                [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                 (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                 (r'.*able$', 'JJ'),                # adjectives
                 (r'.*ness$', 'NN'),                # nouns formed from adjectives
                 (r'.*ly$', 'RB'),                  # adverbs
                 (r'.*s$', 'NNS'),                  # plural nouns
                 (r'.*ing$', 'VBG'),                # gerunds
                 (r'.*ed$', 'VBD'),                 # past tense verbs
                 (r'.*', 'NN')                      # nouns (default)
                 ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the C{malt} package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the C{malt} binary.  If not
            specified, then nltk will search the system for a C{malt}
            binary; and if one is not found, it will raise a
            C{LookupError} exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by L{config_malt} when searching
        #: for the malt executables.
        _malt_path = ['.',
                      '/usr/lib/malt-1*',
                      '/usr/share/malt-1*',
                      '/usr/local/bin',
                      '/usr/local/malt-1*',
                      '/usr/local/bin/malt-1*',
                      '/usr/local/malt-1*',
                      '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: L{list} of L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: L{list} of (word, tag) L{tuple}s.
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        :raise Exception: if the jar was not found, the parser is untrained,
            or the java process exits with a non-zero status.
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

        # One CoNLL row per token (1-based index), then a blank line that
        # terminates the sentence.
        with open(input_file, 'w') as f:
            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')

        # Every option and its value is a separate list item so that paths
        # containing spaces reach java unchanged.
        cmd = ['java', '-jar', self._malt_bin,
               '-w', tempfile.gettempdir(),
               '-c', self.mco,
               '-i', input_file,
               '-o', output_file,
               '-m', 'parse']

        self._execute(cmd, 'parse', verbose)

        return DependencyGraph.load(output_file)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of C{DependencyGraph}s

        :param depgraphs: list of C{DependencyGraph}s for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')

        with open(input_file, 'w') as f:
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))

        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :raise Exception: if the jar was not found or the java process exits
            with a non-zero status (the parser then stays untrained).
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra massaging:
        # its contents are copied to a plain file that java can read.
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            with open(conll_file, 'w') as f:
                f.write(conll_str)

        cmd = ['java', '-jar', self._malt_bin,
               '-w', tempfile.gettempdir(),
               '-c', self.mco,
               '-i', conll_file,
               '-m', 'learn']

        self._execute(cmd, 'train', verbose)

        # Only reached when java exited successfully.
        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        """
        Run a MaltParser command line and fail loudly on a non-zero exit.

        :param cmd: argument list (program first, one item per argument).
        :param type: label used in the log file names, e.g. 'parse' or 'train'.
        :param verbose: if true the child writes to this process' stdout and
            stderr; otherwise output goes to C{malt_<type>.out} and
            C{malt_<type>.err} in the temp directory.
        :return: the exit status (always 0, otherwise an exception is raised).
        :raise Exception: if the process exits with a non-zero status.
        """
        # Imported here so the block does not depend on a module-level import.
        import subprocess

        # No shell is involved and ``cmd`` is not modified: the previous
        # os.system(' '.join(cmd)) broke on paths with spaces, appended the
        # redirection to the caller's list and ignored the exit status.
        if verbose:
            malt_exit = subprocess.call(cmd)
        else:
            temp_dir = tempfile.gettempdir()
            out_path = os.path.join(temp_dir, 'malt_%s.out' % type)
            err_path = os.path.join(temp_dir, 'malt_%s.err' % type)
            with open(out_path, 'w') as out:
                with open(err_path, 'w') as err:
                    malt_exit = subprocess.call(cmd, stdout=out, stderr=err)

        if malt_exit != 0:
            raise Exception("MaltParser %s (%s) failed with exit code %d" %
                            (type, ' '.join(cmd), malt_exit))
        return malt_exit
from nltk.tag import RegexpTagger

# Regex tag patterns.  The tagger tries them in order and uses the first one
# that matches, so the catch-all pattern has to stay last.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # The decimal point is escaped: a bare '.' matched any character, so
    # tokens such as '12x5' were tagged as numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # nouns (default)
]
rt = RegexpTagger(patterns)

# print(...) with a single argument behaves the same on Python 2 and 3.
print(rt.evaluate(test_data))
print(rt.tag(tokens))

## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))
print(bt.evaluate(test_data))
class MaltParser(ParserI):
    """
    Dependency parser that drives the external ``malt.jar`` through ``java``.

    Unless a pre-trained model name is given, a model has to be trained with
    ``train`` or ``train_from_file`` before any parse method can be used.
    """

    def __init__(self, tagger=None, mco=None, working_dir=None,
                 additional_java_args=None):
        """
        An interface for parsing with the Malt Parser.

        :param tagger: object with a ``tag(list of words)`` method.  When
            omitted, a small English ``RegexpTagger`` is used.
        :param mco: The name of the pre-trained model. If provided, training
            will not be required, and MaltParser will use the model file in
            ${working_dir}/${mco}.mco.
        :type mco: str
        :param working_dir: directory for the model and temporary files;
            defaults to ``tempfile.gettempdir()``.
        :param additional_java_args: extra arguments placed between ``java``
            and ``-jar``; defaults to none.
        """
        self.config_malt()
        self.mco = 'malt_temp' if mco is None else mco
        self.working_dir = tempfile.gettempdir() if working_dir is None\
                           else working_dir
        self.additional_java_args = [] if additional_java_args is None else additional_java_args
        self._trained = mco is not None

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
                [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                 (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                 (r'.*able$', 'JJ'),                # adjectives
                 (r'.*ness$', 'NN'),                # nouns formed from adjectives
                 (r'.*ly$', 'RB'),                  # adverbs
                 (r'.*s$', 'NNS'),                  # plural nouns
                 (r'.*ing$', 'VBG'),                # gerunds
                 (r'.*ed$', 'VBD'),                 # past tense verbs
                 (r'.*', 'NN')                      # nouns (default)
                 ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = ['.',
                      '/usr/lib/malt-1*',
                      '/usr/share/malt-1*',
                      '/usr/local/bin',
                      '/usr/local/malt-1*',
                      '/usr/local/bin/malt-1*',
                      '/usr/local/malt-1*',
                      '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://www.maltparser.org/',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.batch_parse([sentence], verbose)[0]

    def batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentences: Input sentences to parse
        :type sentence: list(list(str))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        """
        tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
        return self.tagged_batch_parse(tagged_sentences, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.tagged_batch_parse([sentence], verbose)[0]

    def tagged_batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentence: list(list(tuple(str, str)))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        :raise Exception: if the jar was not found, the parser is untrained,
            or the java process exits with a non-zero status.
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        # delete=False: the files must survive close() so that java can read
        # and write them by name; they are removed in the finally clause.
        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                  dir=self.working_dir,
                                                  delete=False)

        try:
            for sentence in sentences:
                for (i, (word, tag)) in enumerate(sentence, start=1):
                    input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                    input_file.write(input_str.encode("utf8"))
                input_file.write(b'\n\n')
            input_file.close()

            cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco, '-i', input_file.name,
                   '-o', output_file.name, '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            return DependencyGraph.load(output_file.name)
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        try:
            input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
            input_file.write(input_str.encode("utf8"))
            input_file.close()
            self.train_from_file(input_file.name, verbose=verbose)
        finally:
            input_file.close()
            os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :raise Exception: if the jar was not found or the java process exits
            with a non-zero status (the parser then stays untrained).
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra
        # massaging: copy its contents to a real file and train from that.
        if isinstance(conll_file, ZipFilePathPointer):
            input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                     dir=self.working_dir,
                                                     delete=False)
            try:
                # Close the stream that was opened for reading; the previous
                # code called close() on the path pointer itself and left the
                # stream open.
                conll_stream = conll_file.open()
                try:
                    conll_str = conll_stream.read()
                finally:
                    conll_stream.close()
                input_file.write(conll_str)
                input_file.close()
                return self.train_from_file(input_file.name, verbose=verbose)
            finally:
                input_file.close()
                os.remove(input_file.name)

        # Same java options as for parsing, so that e.g. memory settings in
        # additional_java_args also apply to training.
        cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
               '-w', self.working_dir,
               '-c', self.mco, '-i', conll_file, '-m', 'learn']

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser training (%s) "
                            "failed with exit code %d" %
                            (' '.join(cmd), ret))

        self._trained = True

    @staticmethod
    def _execute(cmd, verbose=False):
        """
        Run ``cmd`` and return its exit status.

        :param cmd: argument list (program first, one item per argument).
        :param verbose: if true the child inherits stdout/stderr, otherwise
            its output is captured and discarded.
        :return: the exit status of the process.
        """
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        # communicate() drains the pipes while waiting.  A bare wait() with
        # PIPE can block forever once the child fills the OS pipe buffer.
        p.communicate()
        return p.returncode
class MaltParser(ParserI):
    """
    Dependency parser that drives the external ``malt.jar`` through ``java``.

    A model has to be trained with ``train`` or ``train_from_file`` before
    any of the parse methods can be used.
    """

    def __init__(self, tagger=None):
        """
        Locate the malt jar and set up the tagger used by ``parse``.

        :param tagger: object with a ``tag(list of words)`` method.  When
            omitted, a small English ``RegexpTagger`` is used.
        """
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger([
                (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                (r'.*able$', 'JJ'),                # adjectives
                (r'.*ness$', 'NN'),                # nouns formed from adjectives
                (r'.*ly$', 'RB'),                  # adverbs
                (r'.*s$', 'NNS'),                  # plural nouns
                (r'.*ing$', 'VBG'),                # gerunds
                (r'.*ed$', 'VBD'),                 # past tense verbs
                (r'.*', 'NN')                      # nouns (default)
            ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = [
            '.', '/usr/lib/malt-1*', '/usr/share/malt-1*', '/usr/local/bin',
            '/usr/local/malt-1*', '/usr/local/bin/malt-1*',
            '/usr/local/malt-1*', '/usr/local/share/malt-1*'
        ]

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary(
            'malt.jar',
            bin,
            searchpath=malt_path,
            env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        :raise Exception: if the jar was not found, the parser is untrained,
            or the java process exits with a non-zero status.
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured. Call config_malt() first."
            )
        if not self._trained:
            raise Exception(
                "Parser has not been trained. Call train() first.")

        temp_dir = tempfile.gettempdir()
        input_file = os.path.join(temp_dir, 'malt_input.conll')
        output_file = os.path.join(temp_dir, 'malt_output.conll')

        # One CoNLL row per token (1-based index), then a blank line that
        # terminates the sentence.
        with open(input_file, 'w') as f:
            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i + 1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')

        # Options and their values are separate list items so that paths
        # containing spaces are passed to java unchanged.
        cmd = [
            'java',
            '-jar', self._malt_bin,
            '-w', temp_dir,
            '-c', self.mco,
            '-i', input_file,
            '-o', output_file,
            '-m', 'parse',
        ]

        self._execute(cmd, 'parse', verbose)

        return DependencyGraph.load(output_file)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')

        with open(input_file, 'w') as f:
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))

        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :raise Exception: if the jar was not found or the java process exits
            with a non-zero status (the parser then stays untrained).
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured. Call config_malt() first."
            )

        # If conll_file is a ZipFilePathPointer, then we need to do some extra massaging:
        # its contents are copied to a plain file that java can read.
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(),
                                      'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            with open(conll_file, 'w') as f:
                f.write(conll_str)

        cmd = [
            'java',
            '-jar', self._malt_bin,
            '-w', tempfile.gettempdir(),
            '-c', self.mco,
            '-i', conll_file,
            '-m', 'learn',
        ]

        self._execute(cmd, 'train', verbose)

        # Only reached when java exited successfully.
        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        """
        Run a MaltParser command line and fail loudly on a non-zero exit.

        :param cmd: argument list (program first, one item per argument).
        :param type: label used in the log file names, e.g. 'parse' or 'train'.
        :param verbose: if true the child writes to this process' stdout and
            stderr; otherwise output goes to ``malt_<type>.out`` and
            ``malt_<type>.err`` in the temp directory.
        :return: the exit status (always 0, otherwise an exception is raised).
        :raise Exception: if the process exits with a non-zero status.
        """
        # Imported here so the block does not depend on a module-level import.
        import subprocess

        # Run without a shell and without touching ``cmd``: the previous
        # os.system(' '.join(cmd)) broke on paths with spaces, appended the
        # redirection to the caller's list and ignored the exit status.
        if verbose:
            malt_exit = subprocess.call(cmd)
        else:
            temp_dir = tempfile.gettempdir()
            out_path = os.path.join(temp_dir, 'malt_%s.out' % type)
            err_path = os.path.join(temp_dir, 'malt_%s.err' % type)
            with open(out_path, 'w') as out:
                with open(err_path, 'w') as err:
                    malt_exit = subprocess.call(cmd, stdout=out, stderr=err)

        if malt_exit != 0:
            raise Exception("MaltParser %s (%s) failed with exit code %d" %
                            (type, ' '.join(cmd), malt_exit))
        return malt_exit
# NOTE: RegexpTagger tries the patterns in list order and the first match
# wins, so the catch-all pattern has to stay last to act as the default tag.
patterns = [
    (r".*ing$", "VBG"),                # Gerunds
    (r".*ed$", "VBD"),                 # Simple past
    (r".*es$", "VBZ"),                 # 3rd singular present
    (r".*ould$", "MD"),                # Modals
    (r".*'s$", "NN$"),                 # Possessive nouns
    (r".*s$", "NNS"),                  # Plural nouns
    # The decimal point is escaped: a bare "." matched any character, so
    # tokens such as "12x5" were tagged as numbers.
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN"),                     # Nouns (default)
]
rt = RegexpTagger(regexps=patterns)
print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
# Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
# letters, characters or syllables. Shingles: n-grams where items are just words.
# UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))
# Experiment notes: discourse readings with DrtGlueReadingCommand on top of a
# MaltParser that uses a custom RegexpTagger.
# NOTE(review): `tagger` and `nltk` are used here before the assignments and
# import further down in this snippet -- presumably they come from earlier
# code that is not visible here; confirm.
depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser-1.9.2')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()
# TypeError: 'RegexpTagger' object is not callable
# Probably caused by a version mismatch.

import nltk

pattern = [(r'(March)$', 'MAR')]
tagger = nltk.RegexpTagger(pattern)
# The first call passes a plain string, the second a list of tokens.
print(tagger.tag('He was born in March 1991'))
print(tagger.tag(nltk.word_tokenize('He was born in March 1991')))

# Below is the change suggested on Zhihu; it was tested and still does not work.
# For details see https://www.zhihu.com/people/meng-hui-wei-lai-de-colin/activities
# NOTE(review): RegexpTagger is referenced unqualified although only
# `import nltk` is visible in this snippet -- confirm it is imported elsewhere.
tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                       ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                       ('^(He)$', 'PRP')])
# Here the bound method tagger.tag is passed instead of the tagger object.
depparser = nltk.MaltParser(
    tagger=tagger.tag,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(
    [sent.split() for sent in ['Every dog chases a boy']], reading_command=rc)
dt.readings()
(r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NNS'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ] rt = RegexpTagger(patterns) # accuracy on test data print(rt.evaluate(test_data)) # tagging our sample headline rt.tag(nltk.word_tokenize(sentence)) #%% ## N gram taggers from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) # testing performance of unigram tagger print('unigram tagger: ') print(ut.evaluate(test_data)) print(ut.tag(nltk.word_tokenize(sentence)))
import nltk

# Sample passage (a fill-in-the-blank exercise) to be tagged with discourse
# cue labels.  Adjacent literals are concatenated into one string.
text = nltk.word_tokenize(
    "It is refreshing to read a book about our planet by an author who does not allow "
    "facts to be __________ by politics: well aware of the political disputes about "
    "the effects of human activities on climate and biodiversity, this author does not "
    "permit them to __________ his comprehensive description of what we know "
    "about our biosphere. He emphasizes the enormous gaps in our knowledge, the "
    "sparseness of our observations, and the __________, calling attention to the "
    "many aspects of planetary evolution that must be better understood before we "
    "can accurately diagnose the condition of our planet.")
# print nltk.pos_tag(text)

from nltk.tag import RegexpTagger

# Regex tag patterns.  The tagger tries them in order and uses the first one
# that matches, so the catch-all pattern has to stay last.
patterns = [
    (r'.*who$', 'Clause'),        # clause markers
    (r'.*what$', 'Clause'),
    (r'.*It$', 'Clause'),
    # (r'.*:$', 'Repeat'),
    (r'.*not$', 'Reverse'),       # negation
    (r'.*this$', 'Refer'),        # referring words
    (r'.*them$', 'Refer'),
    (r'.*better$', 'Positive'),
    # NOTE(review): the passage contains "disputes", which this pattern does
    # not match -- confirm whether the plural should be covered.
    (r'.*dispute$', 'Negative'),
    (r'.*', 'NN'),                # nouns (default)
]
rt = RegexpTagger(patterns)

# print(...) with a single argument behaves the same on Python 2 and 3.
print(rt.tag(text))