def setUp(self):
    """Build the chart parser for one test, skipping when inputs are unusable.

    The test is skipped when the lexicon or grammar file is missing, when
    either file fails its q1utils validation, or when constructing the
    grammar/parser raises. On success the parser is stored on
    ``self._parser``.
    """
    lexicon_path = self.LEXICON_FILE_NAME
    if not exists(lexicon_path):
        self.skipTest(
            "Unable to find file {} as lexicon".format(lexicon_path))

    grammar_path = self.GRAMMAR_FILE_NAME
    if not exists(grammar_path):
        self.skipTest(
            "Unable to find file {} as grammar".format(grammar_path))

    # The parse-trees output file is expected to be present already.
    assert exists(self.PARSE_TREES_FILE_NAME)

    lexicon_ok, lexicon_rules = q1utils.sanitizeAndValidateLexicon(
        lexicon_path)
    if not lexicon_ok:
        self.skipTest("Lexicon {} is invalid.".format(lexicon_path))

    grammar_ok, grammar_rules = q1utils.sanitizeAndValidateGrammar(
        grammar_path)
    if not grammar_ok:
        self.skipTest("Grammar {} is invalid.".format(grammar_path))

    # Grammar productions first, then the lexical productions.
    combined_rules = grammar_rules + '\n' + lexicon_rules
    try:
        self._parser = BottomUpChartParser(CFG.fromstring(combined_rules))
    except Exception as error:
        # Any failure while building the grammar or parser skips the test.
        self.skipTest(str(error))
class A2Q1GeneratedUnitTests(unittest.TestCase):
    """Scaffold for generated sentence-parsing tests.

    Each test builds a BottomUpChartParser from the grammar and lexicon
    files and appends the parse trees found for its sentence to the
    parse-trees file. A ``PREAMBLE`` class attribute must be supplied
    (by the generated section) before the class is run.
    """

    LEXICON_FILE_NAME = 'Lexicon'
    GRAMMAR_FILE_NAME = 'Grammar'
    PARSE_TREES_FILE_NAME = 'ParseTrees'

    @classmethod
    def setUpClass(cls):
        """Reset the parse-trees file so it contains only the preamble line."""
        assert hasattr(cls, 'PREAMBLE')
        # clear the old parse trees file. It should start with
        # the preamble specified later on in the generated section
        with open(cls.PARSE_TREES_FILE_NAME, 'w') as f:
            f.write(cls.PREAMBLE)
            f.write('\n')

    def setUp(self):
        """Build ``self._parser``, skipping the test when inputs are unusable.

        Skips when the lexicon or grammar file is missing, when either
        fails q1utils validation, or when building the grammar/parser
        raises.
        """
        if not exists(self.LEXICON_FILE_NAME):
            self.skipTest("Unable to find file {} as lexicon".format(
                self.LEXICON_FILE_NAME))
        if not exists(self.GRAMMAR_FILE_NAME):
            self.skipTest("Unable to find file {} as grammar".format(
                self.GRAMMAR_FILE_NAME))
        # setUpClass is expected to have created this file already.
        assert exists(self.PARSE_TREES_FILE_NAME)
        valid, lexiconText = q1utils.sanitizeAndValidateLexicon(
            self.LEXICON_FILE_NAME)
        if not valid:
            self.skipTest("Lexicon {} is invalid.".format(
                self.LEXICON_FILE_NAME))
        valid, grammarText = q1utils.sanitizeAndValidateGrammar(
            self.GRAMMAR_FILE_NAME)
        if not valid:
            self.skipTest("Grammar {} is invalid.".format(
                self.GRAMMAR_FILE_NAME))
        allRules = grammarText + '\n' + lexiconText
        try:
            grammar = CFG.fromstring(allRules)
            self._parser = BottomUpChartParser(grammar)
        except Exception as e:
            # Any failure while building the grammar or parser skips the
            # test rather than reporting an error.
            self.skipTest(str(e))

    def _runSentenceHelper(self, sentence):
        """Parse ``sentence``, log its trees, and report whether any exist.

        Appends the sentence, then one line per parse tree (or the text
        'No parses'), then a blank separator to the parse-trees file.

        Returns True when at least one parse tree was produced.
        """
        # archetype method for parsing sentences
        # NLTK 3's ChartParser.parse returns a one-shot iterator: it is
        # always truthy and is exhausted by the loop below. Materialise it
        # so the emptiness checks further down are meaningful.
        trees = list(self._parser.parse(word_tokenize(sentence)))
        with open(self.PARSE_TREES_FILE_NAME, 'a') as f:
            f.write(sentence)
            f.write('\n')
            for tree in trees:
                f.write(str(tree))
                f.write('\n')
            if not trees:
                f.write('No parses')
            f.write('\n\n')
        return bool(trees)
class A2Q1GeneratedUnitTests(unittest.TestCase):
    """Generated sentence-parsing tests for the grammar and lexicon files.

    Each test builds a BottomUpChartParser from the grammar and lexicon
    files, parses one sentence, and appends the resulting trees to the
    parse-trees file, which starts with the PREAMBLE line.
    """

    LEXICON_FILE_NAME = 'Lexicon'
    GRAMMAR_FILE_NAME = 'Grammar'
    PARSE_TREES_FILE_NAME = 'ParseTrees'

    @classmethod
    def setUpClass(cls):
        """Reset the parse-trees file so it contains only the preamble line."""
        assert hasattr(cls, 'PREAMBLE')
        with open(cls.PARSE_TREES_FILE_NAME, 'w') as f:
            f.write(cls.PREAMBLE)
            f.write('\n')

    def setUp(self):
        """Build ``self._parser``, skipping the test when inputs are unusable.

        Skips when the lexicon or grammar file is missing, when either
        fails q1utils validation, or when building the grammar/parser
        raises.
        """
        if not exists(self.LEXICON_FILE_NAME):
            self.skipTest("Unable to find file {} as lexicon".format(
                self.LEXICON_FILE_NAME))
        if not exists(self.GRAMMAR_FILE_NAME):
            self.skipTest("Unable to find file {} as grammar".format(
                self.GRAMMAR_FILE_NAME))
        # setUpClass is expected to have created this file already.
        assert exists(self.PARSE_TREES_FILE_NAME)
        valid, lexiconText = q1utils.sanitizeAndValidateLexicon(
            self.LEXICON_FILE_NAME)
        if not valid:
            self.skipTest("Lexicon {} is invalid.".format(
                self.LEXICON_FILE_NAME))
        valid, grammarText = q1utils.sanitizeAndValidateGrammar(
            self.GRAMMAR_FILE_NAME)
        if not valid:
            self.skipTest("Grammar {} is invalid.".format(
                self.GRAMMAR_FILE_NAME))
        allRules = grammarText + '\n' + lexiconText
        try:
            grammar = CFG.fromstring(allRules)
            self._parser = BottomUpChartParser(grammar)
        except Exception as e:
            # Any failure while building the grammar or parser skips the
            # test rather than reporting an error.
            self.skipTest(str(e))

    def _runSentenceHelper(self, sentence):
        """Parse ``sentence``, log its trees, and report whether any exist.

        Appends the sentence, then one line per parse tree (or the text
        'No parses'), then a blank separator to the parse-trees file.

        Returns True when at least one parse tree was produced.
        """
        # NLTK 3's ChartParser.parse returns a one-shot iterator: it is
        # always truthy and is exhausted by the loop below. Materialise it
        # so the emptiness checks further down are meaningful.
        trees = list(self._parser.parse(word_tokenize(sentence)))
        with open(self.PARSE_TREES_FILE_NAME, 'a') as f:
            f.write(sentence)
            f.write('\n')
            for tree in trees:
                f.write(str(tree))
                f.write('\n')
            if not trees:
                f.write('No parses')
            f.write('\n\n')
        return bool(trees)

    # First line written to the parse-trees file by setUpClass.
    PREAMBLE = '%Sam Earle,c2earles,999228438'

    def test_parseTheStudentPreferredToSleep(self):
        self.assertTrue(
            self._runSentenceHelper('the student preferred to sleep'),
            'Could not parse "the student preferred to sleep"')

    def test_parseTheStudentPersuadedTheTeacherToSleep(self):
        self.assertTrue(
            self._runSentenceHelper(
                'the student persuaded the teacher to sleep'),
            'Could not parse "the student persuaded the teacher to sleep"')

    def test_parseTheStudentPromisedTheTeacherToSleep(self):
        self.assertTrue(
            self._runSentenceHelper(
                'the student promised the teacher to sleep'),
            'Could not parse "the student promised the teacher to sleep"')

    def test_parseTheStudentExpectedTheTeacherToSleep(self):
        self.assertTrue(
            self._runSentenceHelper(
                'the student expected the teacher to sleep'),
            'Could not parse "the student expected the teacher to sleep"')

    def test_parseTheTeacherExpectedToSleep(self):
        self.assertTrue(
            self._runSentenceHelper('the teacher expected to sleep'),
            'Could not parse "the teacher expected to sleep"')

    def test_parseTheTeacherPersuadedTheStudent(self):
        self.assertTrue(
            self._runSentenceHelper('the teacher persuaded the student'),
            'Could not parse "the teacher persuaded the student"')

    def test_parseTheTeacherPromisedToSleep(self):
        self.assertTrue(
            self._runSentenceHelper('the teacher promised to sleep'),
            'Could not parse "the teacher promised to sleep"')

    def test_parseTheStudentPreferredTheTeacher(self):
        self.assertTrue(
            self._runSentenceHelper('the student preferred the teacher'),
            'Could not parse "the student preferred the teacher"')
def parse_date_text(text):
    """Parses a date from some text, returns (TimelineDate, index) where
    text[:index] is the text determined to be the date. If no date can be
    found, returns None. Assumes that date is at the beginning of the string
    with no superfluous characters.

    The text is lower-cased, then each candidate produced by
    _possible_texts is parsed with a chart parser built from
    date_grammar_string; the first candidate with at least one parse is
    used. The returned index is the length of that candidate.
    """
    text = text.lower()
    date_parser = BottomUpChartParser(parse_cfg(date_grammar_string))
    parses = []
    for date_text in _possible_texts(text):
        parses = date_parser.nbest_parse(date_text)
        if parses:
            break
    if not parses:
        return None

    # these are all very closely tied to date_grammar
    def numstr(nume):
        # returns string of digits
        return ''.join(l for l in nume.leaves() if l.isdigit())

    def month(month):
        # returns TimePoint
        return TimePoint(month = month_to_num(month[0].node))

    def monthday(monthday):
        # returns TimePoint
        # NOTE(review): `m` is only bound when a MONTH or MONTHNUM child is
        # present -- presumably guaranteed by the grammar; confirm.
        d = None
        for n in monthday:
            if hasattr(n, 'node'):
                if n.node == 'DAY':
                    d = int(numstr(n))
                elif n.node == 'MONTH':
                    m = month(n)
                elif n.node == 'MONTHNUM':
                    # only one of MONTH and MONTHNUM should be present
                    m = TimePoint(month = int(numstr(n)))
        m.day = d
        return m

    def num(num):
        # returns TimePoint
        if num[0].node == 'NUME':
            return TimePoint(int(numstr(num[0])))
        elif num[0].node == 'NUMQ':
            return TimePoint(
                [int(numstr(n)) for n in num[0] if n.node == 'NUME'][0],
                year_approx = True)

    def dece(dece):
        # returns number
        if len(dece) == 1:
            return int(numstr(dece[0]))
        else:
            return float(numstr(dece[0]) + '.' + numstr(dece[2]))

    def dec(dec):
        # returns TimePoint
        if dec[0].node == 'DECE':
            return TimePoint(dece(dec[0]))
        elif dec[0].node == 'DECQ':
            return TimePoint(
                [dece(n) for n in dec[0] if n.node == 'DECE'][0],
                year_approx = True)
        elif dec[0].node == 'DECQQ':
            return TimePoint(dece(dec[0][0]), year_approx = dece(dec[0][4]))

    def period(period):
        # returns TimelineDate
        # NOTE(review): `factor` is only bound for 'century' / 'millenium'
        # nodes -- presumably the only options in the grammar; confirm.
        isad = period.node == 'PERIODAD'
        n = num(period[0][0][0])
        if period[0][2].node == 'century':
            factor = 100
        elif period[0][2].node == 'millenium':
            factor = 1000
        if isad:
            return TimelineDate(n * factor - factor, n * factor)
        else:
            return TimelineDate(-n * factor + 1, -n * factor + factor + 1)

    def yadyymymd(yad):
        # returns TimePoint
        # name stands for year AD: year, year month, year month day
        monthtp = None
        daynum = None
        for s in yad.subtrees():
            if s.node == 'NUM':
                yeartp = num(s)
            elif s.node == 'YADPRECISEYEAR':
                # there should never be a NUM and a YADPRECISEYEAR
                yeartp = TimePoint(int(numstr(s)))
            elif s.node == 'MONTH':
                monthtp = month(s)
            elif s.node == 'DAY':
                daynum = int(numstr(s))
        if monthtp:
            yeartp.month = monthtp.month
        if daynum is not None:
            yeartp.day = daynum
        return yeartp

    def year(year):
        # returns TimePoint
        if year.node == 'YBC':
            # because we are using astronomical years in which x BC is
            # stored as (-x + 1)
            return -num(year[0]) + 1
        elif year.node == 'YAD':
            return yadyymymd(year)

    def _has_child_node(n, label):
        # returns the indices of the direct children of n labelled `label`
        return [i for i, c in enumerate(n)
                if hasattr(c, 'node') and c.node == label]

    def daterange(r):
        # returns TimelineDate
        if r[0][0].node == 'YAD' and r[2][0].node == 'YAD':
            # for cases like 1832-34. gets parsed as YAD to YAD, but we need
            # to modify the second node
            first = date(r[0])
            second = date(r[2])
            first_str = str(first.start.year)
            second_str = str(second.start.year)
            if len(second_str) < len(first_str):
                second.start.year = int(
                    first_str[:len(first_str) - len(second_str)] + second_str)
        elif (r[0].node == 'DATE' and r[0][0].node != 'YAD'
                and r[0][0].node != 'PERIODAD'):
            # these cases should work without any modification to either node
            first = date(r[0])
            second = date(r[2])
        else:
            # for cases like 34-12 b.c. or 12 century to 10th century bc. Gets
            # parsed as YAD to YBC or NUM to YBC. replacement_node finds the
            # '34' in the AST for YAD, and puts it in the corresponding
            # location for a copy of the YBC ast. This gets us a fully
            # qualified date for '34' that we can use to create the range
            if r[0][0].node == 'YAD':
                # builtin next() rather than the Python 2-only .next()
                # method, matching the lookup of `parent` below
                replacement_node = next(
                    r[0][0].subtrees(filter = lambda x: x.node == 'NUM'))
            elif r[0][0].node == 'PERIODAD':
                replacement_node = r[0][0][0][0]
            elif r[0].node == 'ORD':
                replacement_node = r[0]
            else:
                raise RuntimeError('unexpected node type in date AST')
            # copy the second date's AST, replace, and get the date
            first_mock = r[2].copy(True) #deepcopy
            parent = next(first_mock.subtrees(
                lambda t: _has_child_node(t, replacement_node.node)))
            parent[_has_child_node(parent, replacement_node.node)[0]] = replacement_node
            first = date(first_mock)
            second = date(r[2])
        return TimelineDate.span_from_dates(first, second)

    def date(date):
        # returns TimelineDate
        if date[0].node == 'YAD' or date[0].node == 'YBC':
            return TimelineDate(year(date[0]))
        elif date[0].node == 'PERIODAD' or date[0].node == 'PERIODBC':
            return period(date[0])

    def yearsago(yearsago):
        # returns TimelineDate
        # assume years ago means years ago from Jan 1 1950
        if yearsago[0].node == 'YAS':
            return TimelineDate(-num(yearsago[0][0]) + 1950)
        elif yearsago[0].node == 'YAR':
            return TimelineDate(-num(yearsago[0][0]) + 1950,
                                -num(yearsago[0][2]) + 1950)
        elif yearsago[0].node == 'KAS':
            return TimelineDate(-dec(yearsago[0][0]) * 1000 + 1950)
        elif yearsago[0].node == 'KAR':
            return TimelineDate(-dec(yearsago[0][0]) * 1000 + 1950,
                                -dec(yearsago[0][2]) * 1000 + 1950)
        elif yearsago[0].node == 'MAS':
            return TimelineDate(-dec(yearsago[0][0]) * 1000000 + 1950)
        elif yearsago[0].node == 'MAR':
            return TimelineDate(-dec(yearsago[0][0]) * 1000000 + 1950,
                                -dec(yearsago[0][2]) * 1000000 + 1950)

    def monthdayrange(r):
        # returns TimelineDate
        copy_from_first = False
        second = None
        if r[2].node == 'DAY':
            second = TimePoint(day = int(numstr(r[2])))
            copy_from_first = True
        elif r[2].node == 'MONTHDAY':
            second = monthday(r[2])
        elif r[2].node == 'YADYEARMONTH' or r[2].node == 'YADYEARMONTHDAY':
            second = yadyymymd(r[2])
        if r[0].node == 'DAY':
            first = TimePoint(second.year, second.month, int(numstr(r[0])),
                              year_approx = second.year_approx)
        elif r[0].node == 'MONTH':
            first = TimePoint(second.year, month(r[0]).month,
                              year_approx = second.year_approx)
        elif r[0].node == 'MONTHDAY':
            temp = monthday(r[0])
            first = TimePoint(second.year, temp.month, temp.day,
                              year_approx = second.year_approx)
        if copy_from_first:
            # the end point was a bare DAY, so it takes its year and month
            # from the start point
            second.year = first.year
            second.month = first.month
        return TimelineDate(first, second)

    def monthdayyearrange(r):
        # returns TimelineDate
        yeartp = TimePoint(int(numstr(r[4])))
        monthdaytp = monthday(r[0])
        return TimelineDate(
            TimePoint(yeartp.year, monthdaytp.month, monthdaytp.day,
                      year_approx = yeartp.year_approx),
            TimePoint(yeartp.year, monthdaytp.month, int(numstr(r[2])),
                      year_approx = yeartp.year_approx))

    def timename(timename):
        # returns TimelineDate
        if timename[0].node == 'antiquity':
            return TimelineDate(TimePoint(-750), TimePoint(450))

    parse = None
    if len(parses) > 1:
        # ambiguous parses will fall into 3 categories
        # 1. DATE/MONTHDAY ambiguity
        #    for a string like December 3:
        #       MONTHDAY (prefer)
        #       DATE -> YAD -> YADYEARMONTH
        #    or 3 December
        #
        # 2. DATERANGE/MONTHDAYRANGE ambiguity
        #    for a string like 6 June - 3 October 2013
        #       MONTHDAYRANGE -> MONTHDAY TO DATE (prefer)
        #       DATERANGE -> DATE TO DATE -> YAD TO YAD -> YADYEARMONTH TO YADYEARMONTHDAY
        #    (this is almost the same thing as problem 1)
        #
        # The other category is something like 3 December 4
        #    YADYEARMONTHDAY -> MONTHDAY ocommadotsp YADYEAR (prefer)
        #    YADYEARMONTHDAY -> YADYEAR ocommadotsp MONTHDAY
        # this should be extremely rare, and we will not bother dealing with these issues
        # this can also happen in DATERANGE
        temp = [p for p in parses if p[0].node == 'MONTHDAY']
        if len(temp) == 1:
            parse = temp[0]
        if not parse:
            temp = [p for p in parses if p[0].node == 'MONTHDAYRANGE']
            if len(temp) == 1:
                parse = temp[0]
        if not parse:
            warnings.warn('not sure how to decide between multiple parses %s' % date_text)
            parse = parses[0]
    else:
        parse = parses[0]

    # Dispatch on the label of the root's first child.
    # NOTE(review): a label not listed here leaves `result` unbound, so the
    # return below would raise -- confirm the grammar cannot produce one.
    if parse[0].node == 'DATE':
        result = date(parse[0])
    elif parse[0].node == 'YEARSAGO':
        result = yearsago(parse[0])
    elif parse[0].node == 'DATERANGE':
        result = daterange(parse[0])
    elif parse[0].node == 'MONTH':
        result = TimelineDate(month(parse[0]))
    elif parse[0].node == 'MONTHDAY':
        result = TimelineDate(monthday(parse[0]))
    elif parse[0].node == 'MONTHDAYRANGE':
        result = monthdayrange(parse[0])
    elif parse[0].node == 'MONTHDAYYEARRANGE':
        result = monthdayyearrange(parse[0])
    elif parse[0].node == 'TIMENAME':
        result = timename(parse[0])

    return (result, len(date_text))