def getRegexpTaggerAccuracy(testingSet):
    # gets the accuracy of the RegexpTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # regular expressions adapted from the NLTK RegexpTagger documentation
    regexes = [
        (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "AT"),  # articles
        (r".*able$", "JJ"),               # adjectives
        (r".*ness$", "NN"),               # nouns formed from adjectives
        (r".*ly$", "RB"),                 # adverbs
        (r".*s$", "NNS"),                 # plural nouns
        (r".*ing$", "VBG"),               # gerunds
        (r".*ed$", "VBD"),                # past tense verbs
        (r".*", "NN"),                    # nouns (default)
    ]

    # declare tagger
    regexpTagger = RegexpTagger(regexes)

    # test tagger and get predicted POS tags
    regexpTaggedSentences = regexpTagger.tag_sents(untaggedSentences)
    regexpTaggedSentencesPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in regexpTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(goldPOSTags, regexpTaggedSentencesPOSTags)
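# calculateAccuracy() is referenced above but not shown; a minimal sketch,
# assuming it simply measures token-level agreement between the gold and
# predicted tag sequences (hypothetical helper, not from the original code):
def calculateAccuracy(goldTagSequences, predictedTagSequences):
    correct = 0
    total = 0
    for goldTags, predictedTags in zip(goldTagSequences, predictedTagSequences):
        for goldTag, predictedTag in zip(goldTags, predictedTags):
            total += 1
            if goldTag == predictedTag:
                correct += 1
    return correct / total if total else 0.0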
def malt_regex_tagger():
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger(
        [
            (r"\.$", "."), (r"\,$", ","), (r"\?$", "?"),          # full stop, comma, question mark
            (r"\($", "("), (r"\)$", ")"),                          # round brackets
            (r"\[$", "["), (r"\]$", "]"),                          # square brackets
            (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),                      # cardinal numbers
            (r"(The|the|A|a|An|an)$", "DT"),                       # articles
            (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),    # pronouns
            (r"(His|his|Her|her|Its|its)$", "PRP$"),               # possessive
            (r"(my|Your|your|Yours|yours)$", "PRP$"),              # possessive
            (r"(on|On|in|In|at|At|since|Since)$", "IN"),           # time prepositions
            (r"(for|For|ago|Ago|before|Before)$", "IN"),           # time prepositions
            (r"(till|Till|until|Until)$", "IN"),                   # time prepositions
            (r"(by|By|beside|Beside)$", "IN"),                     # space prepositions
            (r"(under|Under|below|Below)$", "IN"),                 # space prepositions
            (r"(over|Over|above|Above)$", "IN"),                   # space prepositions
            (r"(across|Across|through|Through)$", "IN"),           # space prepositions
            (r"(into|Into|towards|Towards)$", "IN"),               # space prepositions
            (r"(onto|Onto|from|From)$", "IN"),                     # space prepositions
            (r".*able$", "JJ"),                                    # adjectives
            (r".*ness$", "NN"),                                    # nouns formed from adjectives
            (r".*ly$", "RB"),                                      # adverbs
            (r".*s$", "NNS"),                                      # plural nouns
            (r".*ing$", "VBG"),                                    # gerunds
            (r".*ed$", "VBD"),                                     # past tense verbs
            (r".*", "NN"),                                         # nouns (default)
        ]
    )
    return _tagger.tag
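# Hypothetical call site for malt_regex_tagger(); note it returns the bound
# tag method rather than the tagger object, so it is called directly on a
# pre-tokenized sentence (a list of word strings):
tag = malt_regex_tagger()
print(tag(['The', 'dog', 'walked', 'across', 'the', 'yard', '.']))
# -> [('The', 'DT'), ('dog', 'NN'), ('walked', 'VBD'), ('across', 'IN'), ...]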
def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
    """
    An interface for parsing with the Malt Parser.

    :param mco: The name of the pre-trained model. If provided, training
        will not be required, and MaltParser will use the model file in
        ${working_dir}/${mco}.mco.
    :type mco: str
    """
    self.config_malt()
    self.mco = 'malt_temp' if mco is None else mco
    self.working_dir = tempfile.gettempdir() if working_dir is None \
        else working_dir
    self.additional_java_args = [] if additional_java_args is None else additional_java_args
    self._trained = mco is not None

    if tagger is not None:
        self.tagger = tagger
    else:
        self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])
def template_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    templates = [
        nltk.tag.brill.nltkdemo18(),
        nltk.tag.brill.nltkdemo18plus(),
        nltk.tag.brill.fntbl37(),
        nltk.tag.brill.brill24()
    ]
    evaluations = []
    for t in templates:
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        tt = BrillTaggerTrainer(init_tagger, t, trace=3)
        currentTagger = tt.train(train_sentences)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)
    return evaluations
def meta_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    evaluations = []
    for i in range(1, nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        template = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
        tt = BrillTaggerTrainer(init_tagger, template, trace=3)
        currentTagger = tt.train(train_sentences, max_rules=i * 50)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)
    return evaluations
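# Hypothetical driver for the two comparison functions above; both rely on
# module-level train_sentences / test_sentences, built here from the Penn
# Treebank sample shipped with NLTK (the 80/20 split is an assumption):
from nltk.corpus import treebank

tagged = list(treebank.tagged_sents())
cutoff = int(0.8 * len(tagged))
train_sentences, test_sentences = tagged[:cutoff], tagged[cutoff:]

print(template_comparison(1))   # compare the four built-in template sets
print(meta_comparison(5))       # grow max_rules in steps of 50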
def test_regexp_tagger(self):
    tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)

    encoded = self.encoder.encode(tagger)
    decoded = self.decoder.decode(encoded)

    self.assertEqual(repr(tagger), repr(decoded))
    self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
    self.assertEqual(tagger._regexps, decoded._regexps)
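# The encoder/decoder fixtures used above are created elsewhere in the test
# class; a plausible setUp, assuming NLTK's JSON round-tripping helpers in
# nltk.jsontags (an assumption, not shown in the original snippet):
def setUp(self):
    from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder
    from nltk.tag import DefaultTagger

    self.encoder = JSONTaggedEncoder()
    self.decoder = JSONTaggedDecoder()
    self.default_tagger = DefaultTagger("NN")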
def demo():
    discourse_demo()

    tagger = RegexpTagger([('^(chases|runs)$', 'VB'),
                           ('^(a)$', 'ex_quant'),
                           ('^(every)$', 'univ_quant'),
                           ('^(dog|boy)$', 'NN'),
                           ('^(he)$', 'PRP')])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
def __init__(self, tagger=None):
    self.config_malt()
    self.mco = 'malt_temp'
    self._trained = False

    if tagger is not None:
        self.tagger = tagger
    else:
        self.tagger = RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),               # adjectives
            (r'.*ness$', 'NN'),               # nouns formed from adjectives
            (r'.*ly$', 'RB'),                 # adverbs
            (r'.*s$', 'NNS'),                 # plural nouns
            (r'.*ing$', 'VBG'),               # gerunds
            (r'.*ed$', 'VBD'),                # past tense verbs
            (r'.*', 'NN')                     # nouns (default)
        ])
def _init_glue(self):
    tagger = RegexpTagger([
        ('^(David|Mary|John)$', 'NNP'),
        ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
        ('^(go|order|vanish|find|approach)$', 'VB'),
        ('^(a)$', 'ex_quant'),
        ('^(every)$', 'univ_quant'),
        ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
        ('^(big|gray|former)$', 'JJ'),
        ('^(him|himself)$', 'PRP')
    ])
    depparser = MaltParser(tagger=tagger)
    self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
def get_pos_tagger():
    from nltk.corpus import treebank

    regexp_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN')                     # nouns (default)
    ])
    brown_train = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=bigram_tagger)
    return main_tagger
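# Hypothetical use of the backoff chain built above: the quantifier regexes
# are tried first, and anything they do not match falls back through the
# bigram and unigram taggers down to the suffix-based regexp defaults.
main_tagger = get_pos_tagger()
print(main_tagger.tag(['Every', 'dog', 'chases', 'a', 'boy']))
# e.g. [('Every', 'univ_quant'), ('dog', 'NN'), ..., ('a', 'ex_quant'), ('boy', 'NN')]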
def ARPosTag(self, List):
    patterns = [
        ('^(الله|لله|ربنا|رب|إله)$', 'لفظ جلالة'),
        ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$', 'حرف'),
        ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
        ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
        ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
        ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
        ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
        ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$', 'كان وأخواتها'),
        ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$', 'إن وأخواتها'),
        ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
        ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
        ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
        ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
        ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق', 'صفة'),
        # ('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$', 'صفة مشبهه باسم فاعل'),
        ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$', 'اسم مفعول'),
        ('^(م[ا-ي]{3})$', 'اسمي الزمان والمكان'),
        ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$', 'فعل'),
        ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
        ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
        ('', 'اسم'),
    ]
    reg = RegexpTagger(patterns)
    tmpList = []
    for k in List:
        tmp = araby.strip_tashkeel(k)
        tmp2 = ''
        for i in self.s2:
            if tmp.endswith(i):
                a = 2
                tmp2 = tmp[0:-a]
            else:
                tmp2 = tmp
        tmpList.append(tmp2)
    return reg.tag(tmpList)
def get_pos_tagger(self):
    regexp_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN')                     # nouns (default)
    ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger)
    return main_tagger
def demo(show_example=-1):
    from nltk.parse import MaltParser

    examples = [
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        "every man believes a dog sleeps",
        "John gives David a sandwich",
        "John chases himself",
    ]
    # 'John persuades David to order a pizza',
    # 'John tries to go',
    # 'John tries to find a unicorn',
    # 'John seems to vanish',
    # 'a unicorn seems to approach',
    # 'every big cat leaves',
    # 'every gray cat leaves',
    # 'every big gray cat leaves',
    # 'a former senator leaves',

    print("============== DEMO ==============")

    tagger = RegexpTagger(
        [
            ("^(David|Mary|John)$", "NNP"),
            (
                "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
                "VB",
            ),
            ("^(go|order|vanish|find|approach)$", "VB"),
            ("^(a)$", "ex_quant"),
            ("^(every)$", "univ_quant"),
            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
            ("^(big|gray|former)$", "JJ"),
            ("^(him|himself)$", "PRP"),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)

    for (i, sentence) in enumerate(examples):
        if i == show_example or show_example == -1:
            print(f"[[[Example {i}]]] {sentence}")
            for reading in glue.parse_to_meaning(sentence.split()):
                print(reading.simplify())
            print("")
def __init__(self, tagger=None):
    self.config_malt()
    self.mco = 'malt_temp'
    self._trained = False

    if tagger is not None:
        self.tagger = tagger
    else:
        self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),  # articles
             (r'.*able$', 'JJ'),               # adjectives
             (r'.*ness$', 'NN'),               # nouns formed from adjectives
             (r'.*ly$', 'RB'),                 # adverbs
             (r'.*s$', 'NNS'),                 # plural nouns
             (r'.*ing$', 'VBG'),               # gerunds
             (r'.*ed$', 'VBD'),                # past tense verbs
             (r'.*', 'NN')                     # nouns (default)
             ])
def question4():
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    train_words = treebank.words()
    init_tagger = RegexpTagger(patterns)
    # Not sure if we need to use BrillTagger or BrillTaggerTrainer??
    # tagger = BrillTagger(init_tagger)
    # tagger = BrillTaggerTrainer(init_tagger)
    return
def Brill_recursion(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    # init_tagger = CRFTagger(feature_func=feature_func)
    # init_tagger.train(train_sentences, 'model.crf.tagger')
    init_tagger = RegexpTagger(patterns)
    currentTagger = None
    current_evaluation = 0.0
    evaluations = []

    for i in range(nb_iterations):
        # Not sure if we need to use BrillTagger or BrillTaggerTrainer??
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
        if i == 0:
            tt = BrillTaggerTrainer(init_tagger, templates, trace=3)
            currentTagger = tt.train(train_sentences)
            current_evaluation = currentTagger.evaluate(test_sentences)
            evaluations.append(current_evaluation)
        else:
            tt = BrillTaggerTrainer(currentTagger, templates, trace=3)
            tagger = tt.train(train_sentences)
            current_evaluation = tagger.evaluate(test_sentences)
            evaluations.append(current_evaluation)
            currentTagger = tagger
        print(current_evaluation)
    return evaluations
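# The Brill-tagger snippets above assume module-level imports along these
# lines (a typical set, not taken verbatim from the original scripts):
import nltk
from nltk.tag import RegexpTagger
from nltk.tag.brill import Pos, Word
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tbl.template import Template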
class MaltParser(ParserI):

    def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
        """
        An interface for parsing with the Malt Parser.

        :param mco: The name of the pre-trained model. If provided, training
            will not be required, and MaltParser will use the model file in
            ${working_dir}/${mco}.mco.
        :type mco: str
        """
        self.config_malt()
        self.mco = 'malt_temp' if mco is None else mco
        self.working_dir = tempfile.gettempdir() if working_dir is None \
            else working_dir
        self.additional_java_args = [] if additional_java_args is None else additional_java_args
        self._trained = mco is not None

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
                [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                 (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                 (r'.*able$', 'JJ'),                # adjectives
                 (r'.*ness$', 'NN'),                # nouns formed from adjectives
                 (r'.*ly$', 'RB'),                  # adverbs
                 (r'.*s$', 'NNS'),                  # plural nouns
                 (r'.*ing$', 'VBG'),                # gerunds
                 (r'.*ed$', 'VBD'),                 # past tense verbs
                 (r'.*', 'NN')                      # nouns (default)
                 ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package. This searches for a
        directory containing the malt jar.

        :param bin: The full path to the ``malt`` binary. If not specified,
            then nltk will search the system for a ``malt`` binary; and if one
            is not found, it will raise a ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables. This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = ['.',
                      '/usr/lib/malt-1*',
                      '/usr/share/malt-1*',
                      '/usr/local/bin',
                      '/usr/local/malt-1*',
                      '/usr/local/bin/malt-1*',
                      '/usr/local/malt-1*',
                      '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
                                     searchpath=malt_path,
                                     env_vars=['MALTPARSERHOME'],
                                     url='http://www.maltparser.org/',
                                     verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        return self.batch_parse([sentence], verbose)[0]

    def batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences as
        a list where each sentence is a list of words. Each sentence will be
        automatically tagged with this MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :return: list(``DependencyGraph``) the dependency graph representation
            of each sentence
        """
        tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
        return self.tagged_batch_parse(tagged_sentences, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        return self.tagged_batch_parse([sentence], verbose)[0]

    def tagged_batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples. The sentences must
        have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: list(``DependencyGraph``) the dependency graph representation
            of each sentence
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                  dir=self.working_dir,
                                                  delete=False)

        try:
            for sentence in sentences:
                for (i, (word, tag)) in enumerate(sentence, start=1):
                    input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                    input_file.write(input_str.encode("utf8"))
                input_file.write(b'\n\n')
            input_file.close()

            cmd = ['java'] + self.additional_java_args + \
                  ['-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco,
                   '-i', input_file.name,
                   '-o', output_file.name,
                   '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            return DependencyGraph.load(output_file.name)
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        try:
            input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
            input_file.write(input_str.encode("utf8"))
            input_file.close()
            self.train_from_file(input_file.name, verbose=verbose)
        finally:
            input_file.close()
            os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured. Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra
        # massaging
        if isinstance(conll_file, ZipFilePathPointer):
            input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                     dir=self.working_dir,
                                                     delete=False)
            try:
                conll_str = conll_file.open().read()
                conll_file.close()
                input_file.write(conll_str)
                input_file.close()
                return self.train_from_file(input_file.name, verbose=verbose)
            finally:
                input_file.close()
                os.remove(input_file.name)

        cmd = ['java', '-jar', self._malt_bin,
               '-w', self.working_dir,
               '-c', self.mco,
               '-i', conll_file,
               '-m', 'learn']

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser training (%s) "
                            "failed with exit code %d" % (' '.join(cmd), ret))

        self._trained = True

    @staticmethod
    def _execute(cmd, verbose=False):
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()
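# Hedged usage sketch for the wrapper class above (a working MaltParser/Java
# setup is assumed; `training_graphs` is a hypothetical list of DependencyGraph
# objects, not part of the original code):
parser = MaltParser()                 # falls back to the built-in RegexpTagger
parser.train(training_graphs)         # writes a temporary .mco model
graph = parser.raw_parse("The dog saw a cat.")
print(graph.tree())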
    testcurve = [1 - x/teststats['tokencount'] for x in testcurve[:take]]

    traincurve = [trainstats['initialerrors']]
    for rulescore in trainstats['rulescores']:
        traincurve.append(traincurve[-1] - rulescore)
    traincurve = [1 - x/trainstats['tokencount'] for x in traincurve[:take]]

    import matplotlib.pyplot as plt
    r = list(range(len(testcurve)))
    plt.plot(r, testcurve, r, traincurve)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'.*', 'NN')])

REGEXP_TAGGER = RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
     (r'.*able$', 'JJ'),                # adjectives
     (r'.*ness$', 'NN'),                # nouns formed from adjectives
     (r'.*ly$', 'RB'),                  # adverbs
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # past tense verbs
     (r'.*', 'NN')                      # nouns (default)
     ])
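# Hypothetical quick check of the two baseline taggers defined above:
# NN_CD_TAGGER only separates numbers from everything else, while
# REGEXP_TAGGER adds a handful of suffix heuristics on top.
sent = "The cat chased 2 mice quickly".split()
print(NN_CD_TAGGER.tag(sent))
print(REGEXP_TAGGER.tag(sent))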
dt.retract_sentence('No person dances')
dt.retract_sentence('No person dances', verbose=True)
dt.readings()

# Add a sentence to the discourse; with informchk=True the added sentence is
# checked for informativity (i.e. whether it contributes new information).
dt.add_sentence('A person dances', informchk=True)
dt.readings()

# The discourse model can accommodate semantic ambiguity and filter out
# unacceptable readings.
# The Glue semantics module is configured to use the wide-coverage Malt
# dependency parser; input sentences must already be tokenized and tagged.
# MaltParser() requires downloading MaltParser from
# http://www.maltparser.org/mco/mco.html, unpacking it into a suitable
# directory, and pointing parser_dirname at that directory.
from nltk.tag import RegexpTagger
tagger = RegexpTagger([('^(chases|runs)$', 'VB'),
                       ('^(a)$', 'ex_quant'),
                       ('^(every)$', 'univ_quant'),
                       ('^(dog|boy)$', 'NN'),
                       ('^(He)$', 'PRP')])
depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser')
depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser-1.9.2')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()
# TypeError: 'RegexpTagger' object is not callable
# presumably caused by a version mismatch
# regex tagger
from nltk.tag import RegexpTagger

# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]
rt = RegexpTagger(patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))

## N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
'''
from nltk.tag import RegexpTagger, untag, UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger, AffixTagger, RegexpTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.corpus import brown, treebank, conll2000
from tag_util import backoff_tagger, train_brill_tagger
import pickle

# train_sents = brown.tagged_sents(categories=['news'])[:40000]
# test_sents = brown.tagged_sents(categories=['news'])[40000:50000]
train_sents = conll2000.tagged_sents()

# some regex pattern that will be used for the RegexpTagger
regex_pattern = [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                 (r'.*ould$', 'MD'),
                 (r'.*ing$', 'VBG'),
                 (r'.*ed$', 'VBD'),
                 (r'.*ness$', 'NN'),
                 (r'.*ment$', 'NN'),
                 (r'.*ful$', 'JJ'),
                 (r'.*ious$', 'JJ'),
                 (r'.*ble$', 'JJ'),
                 (r'.*ic$', 'JJ'),
                 (r'.*ive$', 'JJ'),
                 (r'.*ic$', 'JJ'),
                 (r'.*est$', 'JJ'),
                 (r'mad', 'JJ'),
                 (r'^a$', 'PREP')]

initial_tagger = backoff_tagger(
    train_sents,
    [AffixTagger, UnigramTagger, BigramTagger, TrigramTagger],
    backoff=RegexpTagger(regex_pattern))

# Training the Brill Tagger
brill_tagger = train_brill_tagger(initial_tagger, train_sents)
# print brill_tagger.evaluate(test_sents)

# Save pickle for the later use
f = open('brill_tagger.pickle', 'wb')
pickle.dump(brill_tagger, f)
f.close()
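# Hypothetical follow-up: reloading the pickled Brill tagger saved above.
import pickle

with open('brill_tagger.pickle', 'rb') as f:
    brill_tagger = pickle.load(f)
print(brill_tagger.tag(['The', 'quick', 'brown', 'fox', 'jumps']))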
class MaltParser(ParserI):

    def __init__(self, tagger=None):
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger([
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),               # adjectives
                (r'.*ness$', 'NN'),               # nouns formed from adjectives
                (r'.*ly$', 'RB'),                 # adverbs
                (r'.*s$', 'NNS'),                 # plural nouns
                (r'.*ing$', 'VBG'),               # gerunds
                (r'.*ed$', 'VBD'),                # past tense verbs
                (r'.*', 'NN')                     # nouns (default)
            ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package. This searches for a
        directory containing the malt jar

        :param bin: The full path to the ``malt`` binary. If not specified,
            then nltk will search the system for a ``malt`` binary; and if one
            is not found, it will raise a ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables. This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = [
            '.',
            '/usr/lib/malt-1*',
            '/usr/share/malt-1*',
            '/usr/local/bin',
            '/usr/local/malt-1*',
            '/usr/local/bin/malt-1*',
            '/usr/local/malt-1*',
            '/usr/local/share/malt-1*'
        ]

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary(
            'malt.jar', bin,
            searchpath=malt_path,
            env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the
            sentence
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured. Call config_malt() first."
            )
        if not self._trained:
            raise Exception(
                "Parser has not been trained. Call train() first.")

        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

        execute_string = 'java -jar %s -w %s -c %s -i %s -o %s -m parse'
        if not verbose:
            execute_string += ' > ' + os.path.join(tempfile.gettempdir(), "malt.out")

        f = None
        try:
            f = open(input_file, 'w')
            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i + 1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')
            f.close()

            cmd = [
                'java', '-jar %s' % self._malt_bin,
                '-w %s' % tempfile.gettempdir(),
                '-c %s' % self.mco,
                '-i %s' % input_file,
                '-o %s' % output_file,
                '-m parse'
            ]
            self._execute(cmd, 'parse', verbose)

            return DependencyGraph.load(output_file)
        finally:
            if f:
                f.close()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')
        f = None
        try:
            f = open(input_file, 'w')
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))
        finally:
            if f:
                f.close()
        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured. Call config_malt() first."
            )

        # If conll_file is a ZipFilePathPointer, then we need to do some extra massaging
        f = None
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            f = open(conll_file, 'w')
            f.write(conll_str)
            f.close()

        cmd = [
            'java', '-jar %s' % self._malt_bin,
            '-w %s' % tempfile.gettempdir(),
            '-c %s' % self.mco,
            '-i %s' % conll_file,
            '-m learn'
        ]
        # p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
        #                      stderr=subprocess.STDOUT,
        #                      stdin=subprocess.PIPE)
        # (stdout, stderr) = p.communicate()
        self._execute(cmd, 'train', verbose)
        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        if not verbose:
            temp_dir = os.path.join(tempfile.gettempdir(), '')
            cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' % ((temp_dir, type) * 2))
        malt_exit = os.system(' '.join(cmd))
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]

for split in splits:
    test_brown, train_brown = train_test_split(correct_brown, test_size=split[1] / 100, shuffle=False)
    test_chat, train_chat = train_test_split(correct_chat, test_size=split[1] / 100, shuffle=False)

    # brown
    regex_tagger_brown = RegexpTagger(patterns, backoff=default_tagger_brown)
    unigram_tagger_brown = UnigramTagger(train_brown, backoff=regex_tagger_brown)
    bigram_tagger_brown = BigramTagger(train_brown, backoff=unigram_tagger_brown)

    print(f"--------- BROWN CORPUS TAGGING {split[0]}/{split[1]}---------\n")
    print(
        f"The BigramTagger accuracy for the Brown Corpus is {round(bigram_tagger_brown.evaluate(test_brown),3)}"
    )
    print(
        f"The UnigramTagger accuracy for the Brown Corpus is {round(unigram_tagger_brown.evaluate(test_brown),3)}"
    )
    print(
        f"The RegexpTagger accuracy for the Brown Corpus is {round(regex_tagger_brown.evaluate(test_brown),3)}"
    )
tag to each word
"""
from nltk.tag import DefaultTagger

dt = DefaultTagger("NN")

# Accuracy on test data
print(dt.evaluate(test_data))
print(dt.tag(tokens))

from nltk.tag import RegexpTagger

# define regex tag patterns
patterns = [(r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'NN$'),
            (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*', 'NN')]
rt = RegexpTagger(patterns)

# accuracy on test data
print(rt.evaluate(test_data))

from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print(ut.evaluate(test_data))
print(ut.tag(tokens))
def __init__(self, backoff=None):
    print("Regexp")
    RegexpTagger.__init__(self, patterns, backoff=backoff)
# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

# Training set
training_data = tagged_data_list[:cutoff]
# Evaluation set
evaulation_data = tagged_data_list[cutoff:development_size]
# print "Data is splitted!"

# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'),
                             (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))
def setRegexPatterns(self, regex):
    if regex == "patterns":
        return RegexpTagger(self.patterns)
    elif regex == "modified":
        return RegexpTagger(self.patternsModified)
regextagger = RegexpTagger([
    # (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(kmph|km/hr|kms/hr|kilometers/hr|kilometer/hr)$)', 'KMPH'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(cm|cms|centimeter|centimeters|centimetre|centimetres)$)', 'CM'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(km|kms|kilometer|kilometers|kilometre|kilometres)$)', 'KM'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(mm|mms|millimeter|millimeters|millimetre|millimetres)$)', 'MM'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(m|meter|meters|metre|metres)$)', 'METER'),
    (r"(?i)(^[0-9]+(.[0-9]+)?\s*(ft|feet|foot|\')$)", 'FEET'),
    # (r'(?i)(^[0-9]+(.[0-9]+)?\s*(inch|inches|\")$)', 'INCH'),
    # (r'(?i)(^[0-9]+(.[0-9]+)?\s*(yard|yards|yd|yds)$)', 'YARD'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(mile|miles)$)', 'MILE'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(second|seconds|s|sec|secs)$)', 'SEC'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(minute|minutes|mins|min)$)', 'MIN'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(hour|hours|hr|hrs)$)', 'HOUR'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(day|days)$)', 'DAY'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(year|years|yr|yrs)$)', 'YEAR'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(month|months)$)', 'MONTH'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(week|weeks|wk|wks)$)', 'WEEK'),
    (r'(?i)(^[0-9]+(.[0-9]+)?\s*(gram|gramme|gm|gms|g|grams|grammes|gs)$)', 'GRAM'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(kilogram|kilogramme|kg|kilograms|kilogrammes|kgs)$)', 'KG'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(milligram|milligramme|mg|milligrams|milligrammes|mgs)$)', 'MG'),
    # (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(ton|tons|tonne|tonnes)$)', 'TON'),
    # (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(pounds|pound|lb|lbs)$)', 'POUND'),
    # (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(pounds|pound|lb|lbs)$)', 'LITRE'),
    # (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(pounds|pound|lb|lbs)$)', 'GALLON'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(celcius|c|deg.celcius|deg.c)$)', 'CELCIUS'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(farenheit|f|deg.farenheit|deg.f|degree|deg)$)', 'FARENHEIT'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(kelvin|k|deg.kelvin|deg.k)$)', 'KELVIN'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(volt|volts|V)$)', 'VOLTS'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(ampere|amperes|A|amps|amp)$)', 'AMPS'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(watt|watts|W)$)', 'WATT'),
    (r'(?i)(^-?[0-9]+(.[0-9]+)?\s*(kilowatt|kilowatts|kW)$)', 'kW'),
    (r'.*', 'OTHER')
])
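# Hypothetical spot-check of the unit tagger above; each token is a complete
# "number + unit" string so the anchored patterns can match end to end.
print(regextagger.tag(['10 km', '45 mins', '2 hrs', 'hello']))
# expected: [('10 km', 'KM'), ('45 mins', 'MIN'), ('2 hrs', 'HOUR'), ('hello', 'OTHER')]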
    (r'SOUTH|South|south', 'SOUTH')
    # ('TICKET','VALIDITY')
]

# add learning loop here for tags
def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)
backoff = RegexpTagger(
    [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'is|was|are|were', 'VBZ'),       # verb to be
        (r'"', 'QT'),                      # quote
        (r'.*', 'NN')                      # nouns (default)
    ],
    backoff=def_tagger)

cal2 = {v.upper(): k for k, v in enumerate(calendar.month_abbr)}
cal2.update({v: k for k, v in enumerate(calendar.month_abbr)})
cal2.update({v.upper(): k for k, v in enumerate(calendar.month_name)})
cal2.update({v: k for k, v in enumerate(calendar.month_name)})
del cal2[""]  # remove blank string key

monthModel = {}
monthModel = {k: 'MM' for k, v in cal2.items()}
def __init__(self, backoff=None):
    RegexpTagger.__init__(self, patterns, backoff=backoff)
from nltk.tag import RegexpTagger
from tag_util import patterns, test_sents

tagger = RegexpTagger(patterns)
print(tagger.evaluate(test_sents))
    testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]

    traincurve = [trainstats["initialerrors"]]
    for rulescore in trainstats["rulescores"]:
        traincurve.append(traincurve[-1] - rulescore)
    traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]

    import matplotlib.pyplot as plt

    r = list(range(len(testcurve)))
    plt.plot(r, testcurve, r, traincurve)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger([
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
    (r"(The|the|A|a|An|an)$", "AT"),  # articles
    (r".*able$", "JJ"),               # adjectives
    (r".*ness$", "NN"),               # nouns formed from adjectives
    (r".*ly$", "RB"),                 # adverbs
    (r".*s$", "NNS"),                 # plural nouns
    (r".*ing$", "VBG"),               # gerunds
    (r".*ed$", "VBD"),                # past tense verbs
    (r".*", "NN"),                    # nouns (default)
])


def corpus_size(seqs):
# Assigns tags to tokens by comparing their word strings to a series of regular expressions.
# Define the regex patterns that determine the tags of tokens. Note that when tagging a token,
# expressions are evaluated in order and the last, catch-all pattern defines the default tag.
patterns = [
    (r".*ing$", "VBG"),               # Gerunds
    (r".*ed$", "VBD"),                # Simple past
    (r".*es$", "VBZ"),                # 3rd singular present
    (r".*ould$", "MD"),               # Modals
    (r".*'s$", "NN$"),                # Possessive nouns
    (r".*s$", "NNS"),                 # Plural nouns
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN")                     # Nouns (default)
]
rt = RegexpTagger(regexps=patterns)
print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
# Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
# letters, characters or syllables. Shingles: n-grams where the items are just words.
# UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
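# Sketch of the evaluation step the last comment leads into, assuming the same
# train_data / test_data / tokens variables used earlier in this script:
print(ut.evaluate(test_data))
print(bt.evaluate(test_data))
print(tt.evaluate(test_data))
print(tt.tag(tokens))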