Example #1
0
 def testNoApply(self):
     """
     A rule whose pattern expects 'February' must not apply to a body
     containing 'January', and must leave the timex value unset.
     """
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><February~.+><(\d{4})~.+>',
                              'date', 'testNoApply', r'{#2} + "01" + {#1}')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertFalse(rule.apply(t, '', '', body, [], [])[0])
     # assertEquals is a deprecated alias; assertIsNone states the intent
     self.assertIsNone(t.value)
Example #2
0
 def testApplyCorrectType(self):
     """A 'date' rule must not be applied to a timex whose type is 'time'."""
     pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
     rule = NormalisationRule(pattern, 'date', 'testApplyCorrectType',
                              r'{#2} + "01" + {#1}')
     t = Timex(type='time')
     # Every body token is tagged 'POS' and carries its own set holding t
     body = [(word, 'POS', set([t]))
             for word in ('06', 'th', 'January', '1996')]
     outcome = rule.apply(t, '', '', body, [], [])
     self.assertFalse(outcome[0])
 def testRaiseError(self):
     """
     Building a NormalisationRuleBlock with 'invalid' as its third argument
     (presumably the run type) must raise RuleLoadError.
     """
     pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
     first = NormalisationRule(pattern, 'date', 'testRaiseError1',
                               r'{#2} + "01" + {#1}')
     second = NormalisationRule(pattern, 'date', 'testRaiseError2',
                                r'{#2} + "02" + {#1}')
     with self.assertRaises(RuleLoadError):
         NormalisationRuleBlock(None, [], 'invalid', [first, second])
Example #4
0
 def testApplyInsensitive(self):
     """
     A pattern written with lower-case 'january' must still apply to a
     body containing 'January', producing the value '19960106'.
     """
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><january~.+><(\d{4})~.+>',
                              'date', 'testApplyInsensitive',
                              r'{#2} + "01" + {#1}')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960106')
Example #5
0
 def testApplyFreq(self):
     """A matching rule with a freq expression sets the timex's freq."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date',
                              'testApplyFreq',
                              freq=r'"1D"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.freq, '1D')
Example #6
0
 def testApplyQuant(self):
     """A matching rule with a quant expression sets the timex's quant."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date',
                              'testApplyQuant',
                              quant=r'"EVERY"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.quant, 'EVERY')
Example #7
0
 def testApplyChangeType(self):
     """A matching rule with a change_type expression rewrites the type."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date',
                              'testApplyChangeType',
                              change_type=r'"non-date"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.type, 'non-date')
 def testApplyAll(self):
     """
     In an 'all' block where both rules match, the timex ends up with the
     value produced by the second rule ('19960206').
     """
     rules = [
         NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                           'date', 'testApplyAll1', r'{#2} + "01" + {#1}'),
         NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                           'date', 'testApplyAll2', r'{#2} + "02" + {#1}')
     ]
     b = NormalisationRuleBlock(None, [], 'all', rules)
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(b.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960206')
 def testApplyUntilSuccess2(self):
     """
     In an 'until-success' block whose first rule (February) does not match
     the body, the second rule (January) supplies the value '19960106'.
     """
     rules = [
         NormalisationRule(r'<(\d+)~.+><th~.+><February~.+><(\d{4})~.+>',
                           'date', 'testApplyUntilSuccess2A',
                           r'{#2} + "02" + {#1}'),
         NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                           'date', 'testApplyUntilSuccess2B',
                           r'{#2} + "01" + {#1}')
     ]
     b = NormalisationRuleBlock(None, [], 'until-success', rules)
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(b.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960106')
Example #10
0
 def testNegAfterBlocks(self):
     """
     A negated after-guard whose pattern ('to Atlanta') is present in the
     tokens following the body must stop the rule from applying.
     """
     rule = NormalisationRule(
         r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
         'date',
         'testNegAfterBlocks',
         r'{#2} + "01" + {#1}',
         after_guards=[r'!<to~.+><Atlanta~.+>'])
     t = Timex(type='date')

     # Tokens preceding the timex, the timex itself, and those following it
     before = [(word, 'POS', set())
               for word in ('We', 'took', 'a', 'plane', 'on', 'the')]
     body = [(word, 'POS', set())
             for word in ('06', 'th', 'January', '1996')]
     after = [(word, 'POS', set()) for word in ('to', 'Atlanta')]

     applied = rule.apply(t, '', '', body, before, after)[0]
     self.assertFalse(applied)
 def testPosGuardBlocks(self):
     """
     A positive guard whose pattern ('th February') is absent from the body
     must stop the rule from applying.
     """
     rule = NormalisationRule(
         r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
         'date',
         'testPosGuardBlocks',
         r'{#2} + "01" + {#1}',
         guards=[r'<th~.+><February~.+>'])
     t = Timex(type='date')

     # Tokens preceding the timex, the timex itself, and those following it
     before = [(word, 'POS', set())
               for word in ('We', 'took', 'a', 'plane', 'on', 'the')]
     body = [(word, 'POS', set())
             for word in ('06', 'th', 'January', '1996')]
     after = [(word, 'POS', set()) for word in ('to', 'Atlanta')]

     applied = rule.apply(t, '', '', body, before, after)[0]
     self.assertFalse(applied)
Example #12
0
 def testPosGuardAllows(self):
     """
     A positive guard whose pattern ('th January') is present in the body
     lets the rule apply and set the value '19960106'.
     """
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date',
                              'testPosGuardAllows',
                              r'{#2} + "01" + {#1}',
                              guards=[r'<th~.+><January~.+>'])
     t = Timex(type='date')
     before = [('We', 'POS', set()), ('took', 'POS', set()),
               ('a', 'POS', set()), ('plane', 'POS', set()),
               ('on', 'POS', set()), ('the', 'POS', set())]
     body = [('06', 'POS', set()), ('th', 'POS', set()),
             ('January', 'POS', set()), ('1996', 'POS', set())]
     after = [('to', 'POS', set()), ('Atlanta', 'POS', set())]
     self.assertTrue(rule.apply(t, '', '', body, before, after)[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960106')
 def testNegAfterAllows(self):
     """
     A negated after-guard whose pattern ('a plane') does not occur in the
     tokens following the body lets the rule apply and set '19960106'.
     """
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date',
                              'testNegAfterAllows',
                              r'{#2} + "01" + {#1}',
                              after_guards=[r'!<a~.+><plane~.+>'])
     t = Timex(type='date')
     before = [('We', 'POS', set()), ('took', 'POS', set()),
               ('a', 'POS', set()), ('plane', 'POS', set()),
               ('on', 'POS', set()), ('the', 'POS', set())]
     body = [('06', 'POS', set()), ('th', 'POS', set()),
             ('January', 'POS', set()), ('1996', 'POS', set())]
     after = [('to', 'POS', set()), ('Atlanta', 'POS', set())]
     self.assertTrue(rule.apply(t, '', '', body, before, after)[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960106')
    def _load_rule(self, filename, rulelines):
        """
        Load a 'simple' normalisation rule.

        The rule lines are parsed by self._parse_rule into a mapping from
        field name to the list of values supplied for that field. Each field
        is validated and the collected settings are used to construct a
        NormalisationRule, which is returned.

        filename is used as the rule's default ID and is attached to every
        RuleLoadError raised here; rulelines is passed straight through to
        self._parse_rule.

        Raises RuleLoadError if:
          * 'Type' or 'Match' is present without exactly one value
          * 'Match' is missing altogether
          * a field that may appear at most once is repeated
          * 'Deliminate-Numbers' is neither 'true' nor 'false'
          * 'Deliminate-Numbers' is enabled while Tokenise is empty
          * a field name is not recognised
          * constructing the NormalisationRule raises re.error
        """

        # get key/value dictionaries
        fields = self._parse_rule(filename, rulelines)

        def at_most_one(values, label, default):
            # The sole value if one was given, the default if none was
            if len(values) > 1:
                raise RuleLoadError(filename, "Too many '" + label + "' fields")
            if len(values) == 1:
                return values[0]
            return default

        # Fields allowed at most once, mapped to the label used in errors
        single_labels = {
            'id': 'ID',
            'value': 'Value',
            'change-type': 'Change-Type',
            'freq': 'Freq',
            'quant': 'Quant',
            'mod': 'Mod',
        }

        # Set defaults
        rule_type = None
        match = None
        singles = {
            'id': filename,
            'value': None,
            'change-type': None,
            'freq': None,
            'quant': None,
            'mod': None,
        }
        # Repeatable fields: every value supplied is kept
        lists = {
            'guard': [],
            'after': [],
            'before-guard': [],
            'after-guard': [],
            'sent-guard': [],
        }
        tokenise = True
        deliminate_numbers = False

        # Keyword settings for Tokenise; any other string is used as given
        # (lower-cased)
        tokenise_keywords = {'true': True, 'space': ' ', 'null': ''}

        for key in fields:
            values = fields[key]

            # Exactly one 'Type' field allowed
            if key == 'type':
                if len(values) != 1:
                    raise RuleLoadError(filename, "There must be exactly 1 'Type' field")
                rule_type = values[0]

            # Exactly one 'Match' field allowed
            elif key == 'match':
                if len(values) != 1:
                    raise RuleLoadError(filename, "There must be exactly 1 'Match' field")
                match = values[0]

            # No more than one of each of these allowed
            elif key in single_labels:
                singles[key] = at_most_one(values, single_labels[key], singles[key])

            # set optional, repeatable fields
            elif key in lists:
                lists[key] = values

            elif key == 'tokenise':
                if len(values) > 1:
                    raise RuleLoadError(filename, "Too many 'Tokenise' fields")
                if len(values) == 1:
                    setting = values[0].lower()
                    tokenise = tokenise_keywords.get(setting, setting)

            # Deliminate-Numbers is an optional field, defaulting to False, which
            # accepts either true or false (case-insensitive) as values
            elif key == 'deliminate-numbers':
                if len(values) > 1:
                    raise RuleLoadError(filename, "Too many 'Deliminate-Numbers' fields")
                if len(values) == 1:
                    setting = values[0].lower()
                    if setting not in ('true', 'false'):
                        raise RuleLoadError(filename, "Deliminate-Numbers must be either 'True' or 'False'")
                    deliminate_numbers = (setting == 'true')

            # error on unknown fields
            else:
                raise RuleLoadError(filename, "Unknown field '" + key + "'")

        if match is None:
            raise RuleLoadError(filename, "'Match' is a compulsory field")

        # Only an empty Tokenise setting (e.g. 'null') is falsy
        if deliminate_numbers and not tokenise:
            raise RuleLoadError(filename, "'Deliminate-Numbers' can not be set if Tokenise is null")

        # Guard against any RE errors
        try:
            return NormalisationRule(match, rule_type, singles['id'], singles['value'],
                singles['change-type'], singles['freq'], singles['quant'], singles['mod'],
                lists['guard'], lists['after-guard'], lists['before-guard'],
                lists['sent-guard'], lists['after'], tokenise, deliminate_numbers)
        except re.error as e:
            raise RuleLoadError(filename, "Malformed regular expression: " + str(e))
 def testApplyChangeType(self):
     """A matching rule with a change_type expression rewrites the type."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date', 'testApplyChangeType',
                              change_type=r'"non-date"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.type, 'non-date')
 def testApplyFreq(self):
     """A matching rule with a freq expression sets the timex's freq."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date', 'testApplyFreq', freq=r'"1D"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.freq, '1D')
 def testApplyQuant(self):
     """A matching rule with a quant expression sets the timex's quant."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date', 'testApplyQuant', quant=r'"EVERY"')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.quant, 'EVERY')
 def testApplyCorrectType(self):
     """A 'date' rule must not be applied to a timex whose type is 'time'."""
     pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
     rule = NormalisationRule(pattern, 'date', 'testApplyCorrectType',
                              r'{#2} + "01" + {#1}')
     t = Timex(type='time')
     # Every body token is tagged 'POS' and carries its own set holding t
     body = [(word, 'POS', set([t]))
             for word in ('06', 'th', 'January', '1996')]
     outcome = rule.apply(t, '', '', body, [], [])
     self.assertFalse(outcome[0])
 def testApplyValue(self):
     """A matching rule evaluates its value expression onto the timex."""
     rule = NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                              'date', 'testApplyValue',
                              r'{#2} + "01" + {#1}')
     t = Timex(type='date')
     body = [('06', 'POS', {t}), ('th', 'POS', {t}),
             ('January', 'POS', {t}), ('1996', 'POS', {t})]
     self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(t.value, '19960106')