Example #1

from nltk.tag import RegexpTagger

def getRegexpTaggerAccuracy(testingSet):
    # gets the accuracy of the RegexpTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # regular expressions adapted from the nltk RegexpTagger documentation
    regexes = [
        (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "AT"),  # articles
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]

    # declare tagger
    regexpTagger = RegexpTagger(regexes)

    # test tagger and get predicted POS tags
    regexpTaggedSentences = regexpTagger.tag_sents(untaggedSentences)
    regexpTaggedSentencesPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in regexpTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(goldPOSTags, regexpTaggedSentencesPOSTags)
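The function above delegates to a calculateAccuracy helper that is not shown. A minimal sketch of what it might look like, assuming it compares two parallel lists of per-sentence tag lists (the body below is inferred from the call site, not code from the original project):

def calculateAccuracy(goldPOSTags, predictedPOSTags):
    # assumed helper: fraction of positions where the predicted tag equals the gold tag
    correct = 0
    total = 0
    for goldSentence, predictedSentence in zip(goldPOSTags, predictedPOSTags):
        for goldTag, predictedTag in zip(goldSentence, predictedSentence):
            total += 1
            if goldTag == predictedTag:
                correct += 1
    return correct / total if total else 0.0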
Example #2
def malt_regex_tagger():
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger([
        (r"\.$", "."),
        (r"\,$", ","),
        (r"\?$", "?"),  # fullstop, comma, Qmark
        (r"\($", "("),
        (r"\)$", ")"),  # round brackets
        (r"\[$", "["),
        (r"\]$", "]"),  # square brackets
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "DT"),  # articles
        (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
        (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possessive
        (r"(my|Your|your|Yours|yours)$", "PRP$"),  # possessive
        (r"(on|On|in|In|at|At|since|Since)$", "IN"),  # time prepopsitions
        (r"(for|For|ago|Ago|before|Before)$", "IN"),  # time prepopsitions
        (r"(till|Till|until|Until)$", "IN"),  # time prepopsitions
        (r"(by|By|beside|Beside)$", "IN"),  # space prepopsitions
        (r"(under|Under|below|Below)$", "IN"),  # space prepopsitions
        (r"(over|Over|above|Above)$", "IN"),  # space prepopsitions
        (r"(across|Across|through|Through)$", "IN"),  # space prepopsitions
        (r"(into|Into|towards|Towards)$", "IN"),  # space prepopsitions
        (r"(onto|Onto|from|From)$", "IN"),  # space prepopsitions
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ])
    return _tagger.tag
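Since malt_regex_tagger returns the tagger's bound tag method, it can be applied directly to a tokenized sentence. A quick usage sketch:

tag = malt_regex_tagger()
print(tag('The dog quickly crossed 2 streets .'.split()))
# [('The', 'DT'), ('dog', 'NN'), ('quickly', 'RB'), ('crossed', 'VBD'),
#  ('2', 'CD'), ('streets', 'NNS'), ('.', '.')]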
Example #3
def malt_regex_tagger():
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger([
        (r'\.$', '.'),
        (r'\,$', ','),
        (r'\?$', '?'),  # full stop, comma, question mark
        (r'\($', '('),
        (r'\)$', ')'),  # round brackets
        (r'\[$', '['),
        (r'\]$', ']'),  # square brackets
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'DT'),  # articles
        (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'),  # pronouns
        (r'(His|his|Her|her|Its|its)$', 'PRP$'),  # possessive
        (r'(my|Your|your|Yours|yours)$', 'PRP$'),  # possessive
        (r'(on|On|in|In|at|At|since|Since)$', 'IN'),  # time prepositions
        (r'(for|For|ago|Ago|before|Before)$', 'IN'),  # time prepositions
        (r'(till|Till|until|Until)$', 'IN'),  # time prepositions
        (r'(by|By|beside|Beside)$', 'IN'),  # space prepositions
        (r'(under|Under|below|Below)$', 'IN'),  # space prepositions
        (r'(over|Over|above|Above)$', 'IN'),  # space prepositions
        (r'(across|Across|through|Through)$', 'IN'),  # space prepositions
        (r'(into|Into|towards|Towards)$', 'IN'),  # space prepositions
        (r'(onto|Onto|from|From)$', 'IN'),  # space prepositions
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN'),  # nouns (default)
    ])
    return _tagger.tag
Example #4
    def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
        """
        An interface for parsing with the Malt Parser.

        :param mco: The name of the pre-trained model. If provided, training
            will not be required, and MaltParser will use the model file in
            ${working_dir}/${mco}.mco.
        :type mco: str
        """
        self.config_malt()
        self.mco = 'malt_temp' if mco is None else mco
        self.working_dir = tempfile.gettempdir() if working_dir is None\
                           else working_dir
        self.additional_java_args = [] if additional_java_args is None else additional_java_args
        self._trained = mco is not None

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])
Example #5
def template_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    templates = [
        nltk.tag.brill.nltkdemo18(),
        nltk.tag.brill.nltkdemo18plus(),
        nltk.tag.brill.fntbl37(),
        nltk.tag.brill.brill24()
    ]
    evaluations = []

    for t in templates:
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()

        tt = BrillTaggerTrainer(init_tagger, t, trace=3)
        currentTagger = tt.train(train_sentences)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)

    return evaluations
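template_comparison relies on module-level train_sentences and test_sentences that are not shown. One plausible way to prepare them, using the treebank sample and an assumed 80/20 split:

import nltk
from nltk.corpus import treebank

tagged_sents = treebank.tagged_sents()
cutoff = int(len(tagged_sents) * 0.8)  # assumed split ratio
train_sentences = tagged_sents[:cutoff]
test_sentences = tagged_sents[cutoff:]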
Example #6
def meta_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)

    evaluations = []

    for i in range(1, nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        template = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        tt = BrillTaggerTrainer(init_tagger, template, trace=3)
        currentTagger = tt.train(train_sentences, max_rules=i * 50)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)

    return evaluations
Example #7

    def test_regexp_tagger(self):
        tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)

        encoded = self.encoder.encode(tagger)
        decoded = self.decoder.decode(encoded)

        self.assertEqual(repr(tagger), repr(decoded))
        self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
        self.assertEqual(tagger._regexps, decoded._regexps)
Example #8
def demo():
    discourse_demo()

    tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                           ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                           ('^(he)$', 'PRP')])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
Example #9
    def __init__(self, tagger=None):
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger([
                (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN')  # nouns (default)
            ])
Example #10
    def _init_glue(self):
        tagger = RegexpTagger([
            ('^(David|Mary|John)$', 'NNP'),
            ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
             'VB'), ('^(go|order|vanish|find|approach)$', 'VB'),
            ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'),
            ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
            ('^(big|gray|former)$', 'JJ'), ('^(him|himself)$', 'PRP')
        ])

        depparser = MaltParser(tagger=tagger)
        self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
Example #11
def get_pos_tagger():
    from nltk.corpus import treebank
    regexp_tagger = RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])
    brown_train = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)

    main_tagger = RegexpTagger([(r'(A|a|An|an)$', 'ex_quant'),
                                (r'(Every|every|All|all)$', 'univ_quant')],
                               backoff=bigram_tagger)

    return main_tagger
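A usage sketch for the tagger returned by get_pos_tagger; 'every' and 'a' hit the override patterns while the other tokens fall through to the n-gram backoff chain:

tagger = get_pos_tagger()
print(tagger.tag('every dog chases a cat'.split()))
# 'every' -> 'univ_quant' and 'a' -> 'ex_quant' come from the outer
# RegexpTagger; the remaining words are tagged by the bigram/unigram backoff.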
Example #12
def demo():
    discourse_demo()

    tagger = RegexpTagger([
        ("^(chases|runs)$", "VB"),
        ("^(a)$", "ex_quant"),
        ("^(every)$", "univ_quant"),
        ("^(dog|boy)$", "NN"),
        ("^(he)$", "PRP"),
    ])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
Example #13
    def ARPosTag(self, List):        
        patterns = [
            ('^(الله|لله|ربنا|رب|إله)$','لفظ جلالة'),
            ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$','حرف'),
            ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
            ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
            ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
            ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
            ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
            ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$','كان وأخواتها'),
            ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$','إن وأخواتها'),
            ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
            ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
            ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
            ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
            ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق','صفة'),
            #('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$','صفة مشبهه باسم فاعل'),
            ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$','اسم مفعول'),
            ('^(م[ا-ي]{3})$','اسمي الزمان والمكان'),
            ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$','فعل'),
            ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
            ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
            ('','اسم'),
        ]
        reg = RegexpTagger(patterns)

        tmpList = []
        for k in List:
            tmp = araby.strip_tashkeel(k)
            # strip a two-character suffix if the word ends with any suffix in self.s2
            for suffix in self.s2:
                if tmp.endswith(suffix):
                    tmp = tmp[:-2]
                    break
            tmpList.append(tmp)
        return reg.tag(tmpList)
Example #14
    def get_pos_tagger(self):
        regexp_tagger = RegexpTagger([
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        #Override particular words
        main_tagger = RegexpTagger([(r'(A|a|An|an)$', 'ex_quant'),
                                    (r'(Every|every|All|all)$', 'univ_quant')],
                                   backoff=trigram_tagger)

        return main_tagger
Example #15

def demo(show_example=-1):
    from nltk.parse import MaltParser

    examples = [
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        "every man believes a dog sleeps",
        "John gives David a sandwich",
        "John chases himself",
    ]
    #                'John persuades David to order a pizza',
    #                'John tries to go',
    #                'John tries to find a unicorn',
    #                'John seems to vanish',
    #                'a unicorn seems to approach',
    #                'every big cat leaves',
    #                'every gray cat leaves',
    #                'every big gray cat leaves',
    #                'a former senator leaves',

    print("============== DEMO ==============")

    tagger = RegexpTagger(
        [
            ("^(David|Mary|John)$", "NNP"),
            (
                "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
                "VB",
            ),
            ("^(go|order|vanish|find|approach)$", "VB"),
            ("^(a)$", "ex_quant"),
            ("^(every)$", "univ_quant"),
            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
            ("^(big|gray|former)$", "JJ"),
            ("^(him|himself)$", "PRP"),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)

    for (i, sentence) in enumerate(examples):
        if i == show_example or show_example == -1:
            print(f"[[[Example {i}]]]  {sentence}")
            for reading in glue.parse_to_meaning(sentence.split()):
                print(reading.simplify())
            print("")
Example #16
def demo(show_example=-1):
    from nltk.parse import MaltParser

    examples = [
        'David sees Mary',
        'David eats a sandwich',
        'every man chases a dog',
        'every man believes a dog sleeps',
        'John gives David a sandwich',
        'John chases himself',
    ]
    #                'John persuades David to order a pizza',
    #                'John tries to go',
    #                'John tries to find a unicorn',
    #                'John seems to vanish',
    #                'a unicorn seems to approach',
    #                'every big cat leaves',
    #                'every gray cat leaves',
    #                'every big gray cat leaves',
    #                'a former senator leaves',

    print('============== DEMO ==============')

    tagger = RegexpTagger(
        [
            ('^(David|Mary|John)$', 'NNP'),
            (
                '^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
                'VB',
            ),
            ('^(go|order|vanish|find|approach)$', 'VB'),
            ('^(a)$', 'ex_quant'),
            ('^(every)$', 'univ_quant'),
            ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
            ('^(big|gray|former)$', 'JJ'),
            ('^(him|himself)$', 'PRP'),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)

    for (i, sentence) in enumerate(examples):
        if i == show_example or show_example == -1:
            print('[[[Example %s]]]  %s' % (i, sentence))
            for reading in glue.parse_to_meaning(sentence.split()):
                print(reading.simplify())
            print('')
Example #17
 def __init__(self, tagger=None):
     self.config_malt()
     self.mco = 'malt_temp'
     self._trained = False
     
     if tagger is not None:
         self.tagger = tagger
     else:
         self.tagger = RegexpTagger(
          [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
          (r'(The|the|A|a|An|an)$', 'AT'),   # articles
          (r'.*able$', 'JJ'),                # adjectives
          (r'.*ness$', 'NN'),                # nouns formed from adjectives
          (r'.*ly$', 'RB'),                  # adverbs
          (r'.*s$', 'NNS'),                  # plural nouns
          (r'.*ing$', 'VBG'),                # gerunds
          (r'.*ed$', 'VBD'),                 # past tense verbs
          (r'.*', 'NN')                      # nouns (default)
          ])
Example #18
def question4():
    #Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    train_words = treebank.words()
    init_tagger = RegexpTagger(patterns)

    # BrillTaggerTrainer is the trainer class; its train() method returns a
    # trained BrillTagger, e.g.:
    # tagger = BrillTaggerTrainer(init_tagger, templates).train(train_sents)
    return
Example #19
def Brill_recursion(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    # init_tagger = CRFTagger(feature_func=feature_func)
    # init_tagger.train(train_sentences, 'model.crf.tagger')
    init_tagger = RegexpTagger(patterns)
    currentTagger = None
    current_evaluation = 0.0
    evaluations = []

    for i in range(nb_iterations):
        # BrillTaggerTrainer trains a BrillTagger; see
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        if i == 0:
            tt = BrillTaggerTrainer(init_tagger, templates, trace=3)
            currentTagger = tt.train(train_sentences)
            current_evaluation = currentTagger.evaluate(test_sentences)
            evaluations.append(current_evaluation)

        else:
            tt = BrillTaggerTrainer(currentTagger, templates, trace=3)
            tagger = tt.train(train_sentences)
            current_evaluation = tagger.evaluate(test_sentences)
            evaluations.append(current_evaluation)
            currentTagger = tagger

    print(current_evaluation)
    return evaluations
Example #20
class MaltParser(ParserI):

    def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
        """
        An interface for parsing with the Malt Parser.

        :param mco: The name of the pre-trained model. If provided, training
            will not be required, and MaltParser will use the model file in
            ${working_dir}/${mco}.mco.
        :type mco: str
        """
        self.config_malt()
        self.mco = 'malt_temp' if mco is None else mco
        self.working_dir = tempfile.gettempdir() if working_dir is None\
                           else working_dir
        self.additional_java_args = [] if additional_java_args is None else additional_java_args
        self._trained = mco is not None

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/malt-1*',
                     '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://www.maltparser.org/',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.batch_parse([sentence], verbose)[0]

    def batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        """
        tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
        return self.tagged_batch_parse(tagged_sentences, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.tagged_batch_parse([sentence], verbose)[0]

    def tagged_batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")

        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                 dir=self.working_dir,
                                                 delete=False)

        try:
            for sentence in sentences:
                for (i, (word, tag)) in enumerate(sentence, start=1):
                    input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                    input_file.write(input_str.encode("utf8"))
                input_file.write(b'\n\n')
            input_file.close()

            cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco, '-i', input_file.name,
                   '-o', output_file.name, '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            return DependencyGraph.load(output_file.name)
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        try:
            input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
            input_file.write(input_str.encode("utf8"))
            input_file.close()
            self.train_from_file(input_file.name, verbose=verbose)
        finally:
            input_file.close()
            os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra
        # massaging
        if isinstance(conll_file, ZipFilePathPointer):
            input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                     dir=self.working_dir,
                                                     delete=False)
            try:
                conll_str = conll_file.open().read()
                conll_file.close()
                input_file.write(conll_str)
                input_file.close()
                return self.train_from_file(input_file.name, verbose=verbose)
            finally:
                input_file.close()
                os.remove(input_file.name)

        cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
               '-c', self.mco, '-i', conll_file, '-m', 'learn']

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser training (%s) "
                            "failed with exit code %d" %
                            (' '.join(cmd), ret))

        self._trained = True

    @staticmethod
    def _execute(cmd, verbose=False):
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()
Example #21
    testcurve = [1 - x / teststats['tokencount'] for x in testcurve[:take]]

    traincurve = [trainstats['initialerrors']]
    for rulescore in trainstats['rulescores']:
        traincurve.append(traincurve[-1] - rulescore)
    traincurve = [1 - x / trainstats['tokencount'] for x in traincurve[:take]]

    import matplotlib.pyplot as plt
    r = list(range(len(testcurve)))
    plt.plot(r, testcurve, r, traincurve)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
     (r'.*', 'NN')])

REGEXP_TAGGER = RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
     (r'.*able$', 'JJ'),                # adjectives
     (r'.*ness$', 'NN'),                # nouns formed from adjectives
     (r'.*ly$', 'RB'),                  # adverbs
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # past tense verbs
     (r'.*', 'NN')                      # nouns (default)
])

Example #22
dt.retract_sentence('No person dances')
dt.retract_sentence('No person dances', verbose=True)
dt.readings()

# Add a sentence to the discourse; with informchk=True the added sentence is
# checked for informativity (i.e., whether it contributes new information)
dt.add_sentence('A person dances', informchk=True)
dt.readings()

# The discourse model can accommodate semantic ambiguity and filter out unacceptable readings.
# The Glue semantics module is configured to use the wide-coverage Malt dependency parser;
# input sentences must already be tokenized and tagged.
# MaltParser() requires downloading MaltParser from http://www.maltparser.org/mco/mco.html
# and unpacking it into a suitable directory, which is passed via parser_dirname
from nltk.tag import RegexpTagger

tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                       ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                       ('^(He)$', 'PRP')])
depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser')

depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser-1.9.2')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()

# TypeError: 'RegexpTagger' object is not callable
# probably caused by a version mismatch
Example #23

# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
]
rt = RegexpTagger(patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
Example #24

from nltk.tag import RegexpTagger, untag, UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger, AffixTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.corpus import brown, treebank, conll2000
from tag_util import backoff_tagger, train_brill_tagger
import pickle

# train_sents = brown.tagged_sents(categories=['news'])[:40000]
# test_sents = brown.tagged_sents(categories=['news']) [40000:50000]
train_sents = conll2000.tagged_sents()
# some regex pattern that will be used for the RegexpTagger
regex_pattern = [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*ould$', 'MD'),
                 (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*ness$', 'NN'),
                 (r'.*ment$', 'NN'), (r'.*ful$', 'JJ'), (r'.*ious$', 'JJ'),
                 (r'.*ble$', 'JJ'), (r'.*ic$', 'JJ'), (r'.*ive$', 'JJ'),
                 (r'.*ic$', 'JJ'), (r'.*est$', 'JJ'), (r'mad', 'JJ'),
                 (r'^a$', 'PREP')]

initial_tagger = backoff_tagger(
    train_sents, [AffixTagger, UnigramTagger, BigramTagger, TrigramTagger],
    backoff=RegexpTagger(regex_pattern))

# Training the Brill Tagger
brill_tagger = train_brill_tagger(initial_tagger, train_sents)
# print(brill_tagger.evaluate(test_sents))

# Save pickle for the later use
f = open('brill_tagger.pickle', 'wb')
pickle.dump(brill_tagger, f)
f.close()
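The pickled tagger can later be restored and used without retraining, for example:

import pickle

with open('brill_tagger.pickle', 'rb') as f:
    brill_tagger = pickle.load(f)

print(brill_tagger.tag('The committee approved the new rules'.split()))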
Example #25
class MaltParser(ParserI):
    def __init__(self, tagger=None):
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False

        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger([
                (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN')  # nouns (default)
            ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = [
            '.', '/usr/lib/malt-1*', '/usr/share/malt-1*', '/usr/local/bin',
            '/usr/local/malt-1*', '/usr/local/bin/malt-1*',
            '/usr/local/malt-1*', '/usr/local/share/malt-1*'
        ]

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary(
            'malt.jar',
            bin,
            searchpath=malt_path,
            env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """

        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured.  Call config_malt() first."
            )
        if not self._trained:
            raise Exception(
                "Parser has not been trained.  Call train() first.")

        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

        execute_string = 'java -jar %s -w %s -c %s -i %s -o %s -m parse'
        if not verbose:
            execute_string += ' > ' + os.path.join(tempfile.gettempdir(),
                                                   "malt.out")

        f = None
        try:
            f = open(input_file, 'w')

            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i + 1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')
            f.close()

            cmd = [
                'java',
                '-jar %s' % self._malt_bin,
                '-w %s' % tempfile.gettempdir(),
                '-c %s' % self.mco,
                '-i %s' % input_file,
                '-o %s' % output_file, '-m parse'
            ]

            self._execute(cmd, 'parse', verbose)

            return DependencyGraph.load(output_file)
        finally:
            if f: f.close()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')

        f = None
        try:
            f = open(input_file, 'w')
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))
        finally:
            if f: f.close()

        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured.  Call config_malt() first."
            )

        # If conll_file is a ZipFilePathPointer, then we need to do some extra massaging
        f = None
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(),
                                      'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            f = open(conll_file, 'w')
            f.write(conll_str)
            f.close()

        cmd = [
            'java',
            '-jar %s' % self._malt_bin,
            '-w %s' % tempfile.gettempdir(),
            '-c %s' % self.mco,
            '-i %s' % conll_file, '-m learn'
        ]

        #        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
        #                             stderr=subprocess.STDOUT,
        #                             stdin=subprocess.PIPE)
        #        (stdout, stderr) = p.communicate()

        self._execute(cmd, 'train', verbose)

        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        if not verbose:
            temp_dir = os.path.join(tempfile.gettempdir(), '')
            cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' %
                       ((temp_dir, type) * 2))
        malt_exit = os.system(' '.join(cmd))
Example #26
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default)
]

for split in splits:
    test_brown, train_brown = train_test_split(correct_brown,
                                               test_size=split[1] / 100,
                                               shuffle=False)
    test_chat, train_chat = train_test_split(correct_chat,
                                             test_size=split[1] / 100,
                                             shuffle=False)

    # brown
    regex_tagger_brown = RegexpTagger(patterns, backoff=default_tagger_brown)
    unigram_tagger_brown = UnigramTagger(train_brown,
                                         backoff=regex_tagger_brown)
    bigram_tagger_brown = BigramTagger(train_brown,
                                       backoff=unigram_tagger_brown)

    print(f"--------- BROWN CORPUS TAGGING {split[0]}/{split[1]}---------\n")
    print(
        f"The BigramTagger accuracy for the Brown Corpus is {round(bigram_tagger_brown.evaluate(test_brown),3)}"
    )
    print(
        f"The UnigramTagger accuracy for the Brown Corpus is {round(unigram_tagger_brown.evaluate(test_brown),3)}"
    )
    print(
        f"The RegexpTagger accuracy for the Brown Corpus is {round(regex_tagger_brown.evaluate(test_brown),3)}"
    )
Example #27
# A DefaultTagger assigns the same tag to each word
from nltk.tag import DefaultTagger
dt = DefaultTagger("NN")
#Accuracy on test data
print(dt.evaluate(test_data))

print(dt.tag(tokens))

from nltk.tag import RegexpTagger
#define regex tag patterns

patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN')]
rt = RegexpTagger(patterns)
# accuracy on test data
print(rt.evaluate(test_data))

from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

#testing performance of the unigram tagger
print(ut.evaluate(test_data))
print(ut.tag(tokens))
Example #28

 def __init__(self, backoff=None):
     print("Regexp")
     RegexpTagger.__init__(self, patterns, backoff=backoff)
Example #29

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

# Training set
training_data = tagged_data_list[:cutoff]

# Evaluation set
evaluation_data = tagged_data_list[cutoff:development_size]

# print "Data is splitted!"

# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'PUNC'),
                             (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print "Unigram accuracy: "
print unigram_tagger.evaluate(evaulation_data)

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print "Bigram accuracy: "
print bigram_tagger.evaluate(evaulation_data)

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print "Trigram accuracy: "
print trigram_tagger.evaluate(evaulation_data)
Example #30
 def setRegexPatterns(self, regex):
     if regex == "patterns":
         return RegexpTagger(self.patterns)
     elif regex == "modified":
         return RegexpTagger(self.patternsModified)
Example #31

 regextagger = RegexpTagger([
     # (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(kmph|km/hr|kms/hr|kilometers/hr|kilometer/hr)$)','KMPH'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(cm|cms|centimeter|centimeters|centimetre|centimetres)$)',
      'CM'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(km|kms|kilometer|kilometers|kilometre|kilometres)$)',
      'KM'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(mm|mms|millimeter|millimeters|millimetre|millimetres)$)',
      'MM'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(m|meter|meters|metre|metres)$)', 'METER'),
     (r"(?i)(^[0-9]+(\.[0-9]+)?\s*(ft|feet|foot|\')$)", 'FEET'),
     # (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(inch|inches|\")$)','INCH'),
     # (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(yard|yards|yd|yds)$)','YARD'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(mile|miles)$)', 'MILE'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(second|seconds|s|sec|secs)$)', 'SEC'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(minute|minutes|mins|min)$)', 'MIN'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(hour|hours|hr|hrs)$)', 'HOUR'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(day|days)$)', 'DAY'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(year|years|yr|yrs)$)', 'YEAR'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(month|months)$)', 'MONTH'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(week|weeks|wk|wks)$)', 'WEEK'),
     (r'(?i)(^[0-9]+(\.[0-9]+)?\s*(gram|gramme|gm|gms|g|grams|grammes|gs)$)',
      'GRAM'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(kilogram|kilogramme|kg|kilograms|kilogrammes|kgs)$)',
      'KG'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(milligram|milligramme|mg|milligrams|milligrammes|mgs)$)',
      'MG'),
     # (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(ton|tons|tonne|tonnes)$)','TON'),
     # (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(pounds|pound|lb|lbs)$)','POUND'),
     # (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(pounds|pound|lb|lbs)$)','LITRE'),
     # (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(pounds|pound|lb|lbs)$)','GALLON'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(celcius|c|deg.celcius|deg.c)$)',
      'CELCIUS'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(farenheit|f|deg.farenheit|deg.f|degree|deg)$)',
      'FARENHEIT'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(kelvin|k|deg.kelvin|deg.k)$)',
      'KELVIN'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(volt|volts|V)$)', 'VOLTS'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(ampere|amperes|A|amps|amp)$)', 'AMPS'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(watt|watts|W)$)', 'WATT'),
     (r'(?i)(^-?[0-9]+(\.[0-9]+)?\s*(kilowatt|kilowatts|kW)$)', 'kW'),
     (r'.*', 'OTHER')
 ])
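A quick check of the unit tagger above (a sketch; the expected output assumes the patterns exactly as listed):

print(regextagger.tag(['5 km', '10 mins', '3.5 kg', 'hello']))
# [('5 km', 'KM'), ('10 mins', 'MIN'), ('3.5 kg', 'KG'), ('hello', 'OTHER')]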
Example #32

    (r'SOUTH|South|south', 'SOUTH')

    #            ('TICKET','VALIDITY')
]  # add learning loop here for tags

def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)

backoff = RegexpTagger(
    [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'is|was|are|were', 'VBZ'),  # verb to be
        (r'"', 'QT'),  # quote
        (r'.*', 'NN')  # nouns (default)
    ],
    backoff=def_tagger)
cal2 = {v.upper(): k for k, v in enumerate(calendar.month_abbr)}
cal2.update({v: k for k, v in enumerate(calendar.month_abbr)})
cal2.update({v.upper(): k for k, v in enumerate(calendar.month_name)})
cal2.update({v: k for k, v in enumerate(calendar.month_name)})
del cal2[""]  #remove blank string keyupdat
monthModel = {}
monthModel = {k: 'MM' for k, v in cal2.items()}
Example #33
 def __init__(self, backoff=None):
     RegexpTagger.__init__(self, patterns, backoff=backoff)
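For context, a minimal sketch of the enclosing class such an __init__ implies (the class name and patterns here are hypothetical):

from nltk.tag import RegexpTagger

patterns = [(r'.*ing$', 'VBG'), (r'.*', 'NN')]  # hypothetical module-level patterns

class PatternTagger(RegexpTagger):  # hypothetical name
    def __init__(self, backoff=None):
        RegexpTagger.__init__(self, patterns, backoff=backoff)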
Example #34
class MaltParser(ParserI):

    def __init__(self, tagger=None):
        self.config_malt()
        self.mco = 'malt_temp'
        self._trained = False
        
        if tagger is not None:
            self.tagger = tagger
        else:
            self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])
    
    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the C{malt} package.  This
        searches for a directory containing the malt jar
        
        :param bin: The full path to the C{malt} binary.  If not
            specified, then nltk will search the system for a C{malt}
            binary; and if one is not found, it will raise a
            C{LookupError} exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by L{config_malt} when searching
        #: for the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/malt-1*',
                     '/usr/local/share/malt-1*']
        
        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.
        
        :param sentence: Input sentence to parse
        :type sentence: L{list} of L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.
        
        :param sentence: Input sentence to parse
        :type sentence: L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)
      
    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.
        
        :param sentence: Input sentence to parse
        :type sentence: L{list} of (word, tag) L{tuple}s.
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")
            
        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')
        
        execute_string = 'java -jar %s -w %s -c %s -i %s -o %s -m parse'
        if not verbose:
            execute_string += ' > ' + os.path.join(tempfile.gettempdir(), "malt.out")
        
        f = None
        try:
            f = open(input_file, 'w')

            for (i, (word,tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % 
                        (i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')
            f.close()
        
            cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(), 
                   '-c %s' % self.mco, '-i %s' % input_file, '-o %s' % output_file, '-m parse']

            self._execute(cmd, 'parse', verbose)
            
            return DependencyGraph.load(output_file)
        finally:
            if f: f.close()
    
    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of C{DependencyGraph}s
        
        :param depgraphs: list of C{DependencyGraph}s for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(),'malt_train.conll')

        f = None
        try:
            f = open(input_file, 'w')
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))
        finally:
            if f: f.close()
            
        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file
        
        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra massaging
        f = None
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(),'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            f = open(conll_file,'w')
            f.write(conll_str)
            f.close()        

        cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(), 
               '-c %s' % self.mco, '-i %s' % conll_file, '-m learn']
        
#        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
#                             stderr=subprocess.STDOUT,
#                             stdin=subprocess.PIPE)
#        (stdout, stderr) = p.communicate()
                
        self._execute(cmd, 'train', verbose)
        
        self._trained = True
        
    def _execute(self, cmd, type, verbose=False):
        if not verbose: 
            temp_dir = os.path.join(tempfile.gettempdir(), '')
            cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' % ((temp_dir, type)*2))
        malt_exit = os.system(' '.join(cmd))
Example #35
from nltk.tag import RegexpTagger
from tag_util import patterns, test_sents


tagger = RegexpTagger(patterns)
print(tagger.evaluate(test_sents))
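Example #35 imports patterns and test_sents from a local tag_util module that is not shown. A minimal hypothetical stand-in:

# tag_util.py -- hypothetical stand-in for the module imported above
from nltk.corpus import treebank

patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),   # simple past
    (r'.*', 'NN'),       # default
]
test_sents = treebank.tagged_sents()[3000:]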
Example #36
    testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]

    traincurve = [trainstats["initialerrors"]]
    for rulescore in trainstats["rulescores"]:
        traincurve.append(traincurve[-1] - rulescore)
    traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]

    import matplotlib.pyplot as plt

    r = list(range(len(testcurve)))
    plt.plot(r, testcurve, r, traincurve)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger([
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
    (r"(The|the|A|a|An|an)$", "AT"),  # articles
    (r".*able$", "JJ"),  # adjectives
    (r".*ness$", "NN"),  # nouns formed from adjectives
    (r".*ly$", "RB"),  # adverbs
    (r".*s$", "NNS"),  # plural nouns
    (r".*ing$", "VBG"),  # gerunds
    (r".*ed$", "VBD"),  # past tense verbs
    (r".*", "NN"),  # nouns (default)
])


def corpus_size(seqs):
Example #37
#    Assigns tags to tokens by comparing their word strings to a series of regular expressions

# Define the regex patterns that determine token tags. Note that patterns are tried
# in order and the first match wins, so the final catch-all pattern supplies the default tag
patterns = [
    (r".*ing$", "VBG"),  # Gerunds
    (r".*ed$", "VBD"),  # Simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # Modals
    (r".*'s$", "NN$"),  # Possesive pronouns
    (r".*s$", "NNS"),  # Plural nouns
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN")  # Nouns (default)
]

rt = RegexpTagger(regexps=patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
#    Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
#    letters, characters or syllables. Shingles: n-grams where items are just words.
#    UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger