Ejemplo n.º 1
0
 def setUp(self):
     """Build ``self._parser`` for a test, skipping when inputs are unusable.

     The test is skipped when the lexicon or grammar file is missing, when
     either file is reported invalid by its ``q1utils`` validator, or when
     constructing the grammar/parser raises. The parse-trees file is
     asserted to exist.
     """
     lexicon_path = self.LEXICON_FILE_NAME
     grammar_path = self.GRAMMAR_FILE_NAME

     # A missing rule file is a reason to skip, not to fail.
     if not exists(lexicon_path):
         self.skipTest(
             "Unable to find file {} as lexicon".format(lexicon_path))
     if not exists(grammar_path):
         self.skipTest(
             "Unable to find file {} as grammar".format(grammar_path))
     assert exists(self.PARSE_TREES_FILE_NAME)

     # Each validator hands back a (flag, text) pair.
     lexicon_ok, lexicon_rules = q1utils.sanitizeAndValidateLexicon(
         lexicon_path)
     if not lexicon_ok:
         self.skipTest("Lexicon {} is invalid.".format(lexicon_path))

     grammar_ok, grammar_rules = q1utils.sanitizeAndValidateGrammar(
         grammar_path)
     if not grammar_ok:
         self.skipTest("Grammar {} is invalid.".format(grammar_path))

     # Grammar text first, then the lexicon text, one rule set per line run.
     combined_rules = grammar_rules + '\n' + lexicon_rules

     try:
         self._parser = BottomUpChartParser(CFG.fromstring(combined_rules))
     except Exception as err:
         # Any failure building the grammar or parser skips the test.
         self.skipTest(str(err))
Ejemplo n.º 2
0
class A2Q1GeneratedUnitTests(unittest.TestCase):
    """Test-case base that parses sentences with a grammar plus lexicon.

    Each test gets a ``BottomUpChartParser`` built from the grammar and
    lexicon files; every parsed sentence and its trees are appended to the
    parse-trees file.
    """

    # File names are resolved relative to the current working directory.
    LEXICON_FILE_NAME = 'Lexicon'
    GRAMMAR_FILE_NAME = 'Grammar'
    PARSE_TREES_FILE_NAME = 'ParseTrees'

    @classmethod
    def setUpClass(cls):
        """Truncate the parse-trees file and write ``cls.PREAMBLE`` to it."""
        assert(hasattr(cls,'PREAMBLE'))

        # clear the old parse trees file. It should start with
        # the preamble specified later on in the generated section
        with open(cls.PARSE_TREES_FILE_NAME,'w') as f:
            f.write(cls.PREAMBLE)
            f.write('\n')

    def setUp(self):
        """Build ``self._parser``; skip the test when inputs are unusable.

        Skips when the lexicon or grammar file is missing or reported
        invalid by ``q1utils``, or when building the grammar/parser raises.
        """
        if not exists(self.LEXICON_FILE_NAME):
            self.skipTest("Unable to find file {} as lexicon".format(
                self.LEXICON_FILE_NAME))
        if not exists(self.GRAMMAR_FILE_NAME):
            self.skipTest("Unable to find file {} as grammar".format(
                self.GRAMMAR_FILE_NAME))
        assert exists(self.PARSE_TREES_FILE_NAME)

        valid,lexiconText = q1utils.sanitizeAndValidateLexicon(
            self.LEXICON_FILE_NAME)
        if not valid:
            self.skipTest("Lexicon {} is invalid.".format(
                self.LEXICON_FILE_NAME))

        valid,grammarText = q1utils.sanitizeAndValidateGrammar(
            self.GRAMMAR_FILE_NAME)
        if not valid:
            self.skipTest("Grammar {} is invalid.".format(
                self.GRAMMAR_FILE_NAME))

        allRules = grammarText + '\n' + lexiconText

        try:
            grammar = CFG.fromstring(allRules)
            self._parser = BottomUpChartParser(grammar)
        except Exception as e:
            # Deliberately broad: any grammar/parser construction failure
            # is reported as a skip carrying the error text.
            self.skipTest(str(e))

    def _runSentenceHelper(self,sentence):
        """Parse ``sentence``, append the result to the parse-trees file.

        Writes the sentence, then one line per parse tree (or the line
        ``No parses`` when there are none), then a blank separator.

        Returns True if at least one parse was found, False otherwise.
        """
        # archetype method for parsing sentences
        # parse() may hand back a one-shot iterator (it does in NLTK 3).
        # An iterator is always truthy and is exhausted by the loop below,
        # so materialise it before testing for emptiness.
        trees = list(self._parser.parse(word_tokenize(sentence)))
        with open(self.PARSE_TREES_FILE_NAME,'a') as f:
            f.write(sentence)
            f.write('\n')
            for tree in trees:
                f.write(str(tree))
                f.write('\n')
            if not trees:
                f.write('No parses')
            f.write('\n\n')

        return bool(trees)
Ejemplo n.º 3
0
class A2Q1GeneratedUnitTests(unittest.TestCase):
    """Generated tests: each sentence must parse with the grammar + lexicon.

    Each test gets a ``BottomUpChartParser`` built from the grammar and
    lexicon files; every parsed sentence and its trees are appended to the
    parse-trees file, which starts with ``PREAMBLE``.
    """

    # File names are resolved relative to the current working directory.
    LEXICON_FILE_NAME = 'Lexicon'
    GRAMMAR_FILE_NAME = 'Grammar'
    PARSE_TREES_FILE_NAME = 'ParseTrees'

    @classmethod
    def setUpClass(cls):
        """Truncate the parse-trees file and write ``cls.PREAMBLE`` to it."""
        assert(hasattr(cls,'PREAMBLE'))
        with open(cls.PARSE_TREES_FILE_NAME,'w') as f:
            f.write(cls.PREAMBLE)
            f.write('\n')

    def setUp(self):
        """Build ``self._parser``; skip the test when inputs are unusable.

        Skips when the lexicon or grammar file is missing or reported
        invalid by ``q1utils``, or when building the grammar/parser raises.
        """
        if not exists(self.LEXICON_FILE_NAME):
            self.skipTest("Unable to find file {} as lexicon".format(
                self.LEXICON_FILE_NAME))
        if not exists(self.GRAMMAR_FILE_NAME):
            self.skipTest("Unable to find file {} as grammar".format(
                self.GRAMMAR_FILE_NAME))
        assert exists(self.PARSE_TREES_FILE_NAME)
        valid,lexiconText = q1utils.sanitizeAndValidateLexicon(
            self.LEXICON_FILE_NAME)
        if not valid:
            self.skipTest("Lexicon {} is invalid.".format(
                self.LEXICON_FILE_NAME))
        valid,grammarText = q1utils.sanitizeAndValidateGrammar(
            self.GRAMMAR_FILE_NAME)
        if not valid:
            self.skipTest("Grammar {} is invalid.".format(
                self.GRAMMAR_FILE_NAME))
        allRules = grammarText + '\n' + lexiconText
        try:
            grammar = CFG.fromstring(allRules)
            self._parser = BottomUpChartParser(grammar)
        except Exception as e:
            # Deliberately broad: any grammar/parser construction failure
            # is reported as a skip carrying the error text.
            self.skipTest(str(e))

    def _runSentenceHelper(self,sentence):
        """Parse ``sentence``, append the result to the parse-trees file.

        Writes the sentence, then one line per parse tree (or the line
        ``No parses`` when there are none), then a blank separator.

        Returns True if at least one parse was found, False otherwise.
        """
        # parse() may hand back a one-shot iterator (it does in NLTK 3).
        # An iterator is always truthy and is exhausted by the loop below,
        # so materialise it before testing for emptiness.
        trees = list(self._parser.parse(word_tokenize(sentence)))
        with open(self.PARSE_TREES_FILE_NAME,'a') as f:
            f.write(sentence)
            f.write('\n')
            for tree in trees:
                f.write(str(tree))
                f.write('\n')
            if not trees:
                f.write('No parses')
            f.write('\n\n')
        return bool(trees)

    # First line written to the parse-trees file by setUpClass.
    PREAMBLE = '%Sam Earle,c2earles,999228438'

    # Generated section: one test per sentence; each asserts that the
    # sentence has at least one parse.

    def test_parseTheStudentPreferredToSleep(self):
        self.assertTrue(self._runSentenceHelper('the student preferred to sleep'),
            'Could not parse "the student preferred to sleep"')

    def test_parseTheStudentPersuadedTheTeacherToSleep(self):
        self.assertTrue(self._runSentenceHelper('the student persuaded the teacher to sleep'),
            'Could not parse "the student persuaded the teacher to sleep"')

    def test_parseTheStudentPromisedTheTeacherToSleep(self):
        self.assertTrue(self._runSentenceHelper('the student promised the teacher to sleep'),
            'Could not parse "the student promised the teacher to sleep"')

    def test_parseTheStudentExpectedTheTeacherToSleep(self):
        self.assertTrue(self._runSentenceHelper('the student expected the teacher to sleep'),
            'Could not parse "the student expected the teacher to sleep"')

    def test_parseTheTeacherExpectedToSleep(self):
        self.assertTrue(self._runSentenceHelper('the teacher expected to sleep'),
            'Could not parse "the teacher expected to sleep"')

    def test_parseTheTeacherPersuadedTheStudent(self):
        self.assertTrue(self._runSentenceHelper('the teacher persuaded the student'),
            'Could not parse "the teacher persuaded the student"')

    def test_parseTheTeacherPromisedToSleep(self):
        self.assertTrue(self._runSentenceHelper('the teacher promised to sleep'),
            'Could not parse "the teacher promised to sleep"')

    def test_parseTheStudentPreferredTheTeacher(self):
        self.assertTrue(self._runSentenceHelper('the student preferred the teacher'),
            'Could not parse "the student preferred the teacher"')
Ejemplo n.º 4
0
def parse_date_text(text):
	"""Parses a date from the beginning of some text.

	The text is lower-cased, and each candidate produced by
	``_possible_texts`` is run through a bottom-up chart parser built from
	``date_grammar_string``. The first candidate with at least one parse is
	converted into a ``TimelineDate`` by walking its parse tree.

	Returns ``(TimelineDate, index)`` where ``text[:index]`` is the text
	determined to be the date (``index`` is the length of the candidate that
	parsed). If no date can be found, returns None. Assumes that the date is
	at the beginning of the string with no superfluous characters.

	Raises ``RuntimeError`` when the parse tree has a node type that the
	converters below do not handle.
	"""
	text = text.lower()

	# NOTE(review): the parser is rebuilt from the grammar on every call.
	date_parser = BottomUpChartParser(parse_cfg(date_grammar_string))

	parses = []

	# Stop at the first candidate text that yields any parse; date_text is
	# deliberately left bound to that candidate for use further down.
	for date_text in _possible_texts(text):
		parses = date_parser.nbest_parse(date_text)
		if parses:
			break

	if not parses:
		return None

	# these are all very closely tied to date_grammar
	def numstr(nume): # returns string of digits
		return ''.join(l for l in nume.leaves() if l.isdigit())
	def month(month): # returns TimePoint
		return TimePoint(month = month_to_num(month[0].node))
	def monthday(monthday): # returns TimePoint
		d = None
		for n in monthday:
			# leaves have no 'node' attribute and are skipped
			if hasattr(n, 'node'):
				if n.node == 'DAY':
					d = int(numstr(n))
				elif n.node == 'MONTH':
					m = month(n)
				elif n.node == 'MONTHNUM': # only one of MONTH and MONTHNUM should be present
					m = TimePoint(month = int(numstr(n)))
		# NOTE(review): m is only bound when a MONTH or MONTHNUM child was
		# seen; presumably the grammar guarantees one - confirm.
		m.day = d
		return m
	def num(num): # returns TimePoint
		if num[0].node == 'NUME':
			return TimePoint(int(numstr(num[0])))
		elif num[0].node == 'NUMQ':
			# qualified number: take the first NUME child and flag the
			# year as approximate
			return TimePoint(
				[int(numstr(n)) for n in num[0] if n.node == 'NUME'][0],
				year_approx = True)
	def dece(dece): # returns number
		# one child: integer; otherwise children 0 and 2 are joined around
		# a decimal point
		if len(dece) == 1: return int(numstr(dece[0]))
		else: return float(numstr(dece[0]) + '.' + numstr(dece[2]))
	def dec(dec): # returns TimePoint
		if dec[0].node == 'DECE':
			return TimePoint(dece(dec[0]))
		elif dec[0].node == 'DECQ':
			return TimePoint([dece(n) for n in dec[0] if n.node == 'DECE'][0], year_approx = True)
		elif dec[0].node == 'DECQQ':
			# child 0 is the value, child 4 is passed as year_approx
			return TimePoint(dece(dec[0][0]), year_approx = dece(dec[0][4]))
	def period(period): # returns TimelineDate
		isad = period.node == 'PERIODAD'

		n = num(period[0][0][0])
		# NOTE(review): factor is only bound for 'century' / 'millenium'
		# nodes; presumably the grammar allows nothing else - confirm.
		if period[0][2].node == 'century': factor = 100
		elif period[0][2].node == 'millenium': factor = 1000

		# NOTE(review): n is a TimePoint, so the arithmetic below relies on
		# operators TimePoint defines elsewhere.
		if isad:
			return TimelineDate(n * factor - factor, n * factor)
		else:
			return TimelineDate(-n * factor + 1, -n * factor + factor + 1)
	def yadyymymd(yad): # returns TimePoint
		# name stands for year AD: year, year month, year month day
		monthtp = None
		daynum = None
		for s in yad.subtrees():
			if s.node == 'NUM':
				yeartp = num(s)
			elif s.node == 'YADPRECISEYEAR':
				# there should never be a NUM and a YADPRECISEYEAR
				yeartp = TimePoint(int(numstr(s)))
			elif s.node == 'MONTH':
				monthtp = month(s)
			elif s.node == 'DAY':
				daynum = int(numstr(s))
		if monthtp: yeartp.month = monthtp.month
		if daynum is not None: yeartp.day = daynum
		return yeartp
	def year(year): # returns TimePoint
		if year.node == 'YBC': return -num(year[0]) + 1 # because we are using astronomical years in which x BC is stored as (-x + 1)
		elif year.node == 'YAD': return yadyymymd(year)
	def _has_child_node(n, label):
		# indices of the direct children of n labelled `label` (empty, and
		# therefore falsy, when there are none)
		return [i for i, c in enumerate(n) if hasattr(c, 'node') and c.node == label]
	def daterange(r): # returns TimelineDate
		if r[0][0].node == 'YAD' and r[2][0].node == 'YAD':
			# for cases like 1832-34. gets parsed as YAD to YAD, but we need
			# to modify the second node
			first = date(r[0])
			second = date(r[2])
			first_str = str(first.start.year)
			second_str = str(second.start.year)
			if len(second_str) < len(first_str):
				# borrow the missing leading digits from the first year
				second.start.year = int(
					first_str[:len(first_str) - len(second_str)] + second_str)
		elif r[0].node == 'DATE' and r[0][0].node != 'YAD' and r[0][0].node != 'PERIODAD':
			# these cases should work without any modification to either node
			first = date(r[0])
			second = date(r[2])
		else:
			# for cases like 34-12 b.c. or 12 century to 10th century bc. Gets
			# parsed as YAD to YBC or NUM to YBC. replacement_node finds the
			# '34' in the AST for YAD, and puts it in the corresponding
			# location for a copy of the YBC ast. This gets us a fully
			# qualified date for '34' that we can use to create the range
			if r[0][0].node == 'YAD':
				# builtin next() rather than the Python-2-only .next() method,
				# matching the lookup of `parent` below
				replacement_node = next(r[0][0].subtrees(filter = lambda x: x.node == 'NUM'))
			elif r[0][0].node == 'PERIODAD':
				replacement_node = r[0][0][0][0]
			elif r[0].node == 'ORD':
				replacement_node = r[0]
			else:
				raise RuntimeError('unexpected node type in date AST')
			# copy the second date's AST, replace, and get the date
			first_mock = r[2].copy(True) #deepcopy
			parent = next(first_mock.subtrees(lambda t: _has_child_node(t, replacement_node.node)))
			parent[_has_child_node(parent, replacement_node.node)[0]] = replacement_node
			first = date(first_mock)

			second = date(r[2])

		return TimelineDate.span_from_dates(first, second)
	def date(date): # returns TimelineDate
		if date[0].node == 'YAD' or date[0].node == 'YBC':
			return TimelineDate(year(date[0]))
		elif date[0].node == 'PERIODAD' or date[0].node == 'PERIODBC':
			return period(date[0])
	def yearsago(yearsago): # returns TimelineDate
		# assume years ago means years ago from Jan 1 1950
		# the K* branches scale by 1000 and the M* branches by 1000000;
		# the *S branches build a single point, the *R branches a range
		# from children 0 and 2
		if yearsago[0].node == 'YAS':
			return TimelineDate(-num(yearsago[0][0]) + 1950)
		elif yearsago[0].node == 'YAR':
			return TimelineDate(-num(yearsago[0][0]) + 1950, -num(yearsago[0][2]) + 1950)
		elif yearsago[0].node == 'KAS':
			return TimelineDate(-dec(yearsago[0][0]) * 1000 + 1950)
		elif yearsago[0].node == 'KAR':
			return TimelineDate(-dec(yearsago[0][0]) * 1000 + 1950, -dec(yearsago[0][2]) * 1000 + 1950)
		elif yearsago[0].node == 'MAS':
			return TimelineDate(-dec(yearsago[0][0]) * 1000000 + 1950)
		elif yearsago[0].node == 'MAR':
			return TimelineDate(-dec(yearsago[0][0]) * 1000000 + 1950, -dec(yearsago[0][2]) * 1000000 + 1950)
	def monthdayrange(r): # returns TimelineDate
		copy_from_first = False
		second = None
		if r[2].node == 'DAY':
			# the end is a bare day: its year and month are copied from the
			# start once that has been built
			second = TimePoint(day = int(numstr(r[2])))
			copy_from_first = True
		elif r[2].node == 'MONTHDAY':
			second = monthday(r[2])
		elif r[2].node == 'YADYEARMONTH' or r[2].node == 'YADYEARMONTHDAY':
			second = yadyymymd(r[2])

		# the start borrows whatever it lacks (year, possibly month) from
		# the end
		if r[0].node == 'DAY':
			first = TimePoint(second.year, second.month, int(numstr(r[0])), year_approx = second.year_approx)
		elif r[0].node == 'MONTH':
			first = TimePoint(second.year, month(r[0]).month, year_approx = second.year_approx)
		elif r[0].node == 'MONTHDAY':
			temp = monthday(r[0])
			first = TimePoint(second.year, temp.month, temp.day, year_approx = second.year_approx)

		if copy_from_first:
			second.year = first.year
			second.month = first.month

		return TimelineDate(first, second)
	def monthdayyearrange(r): # returns TimelineDate
		# child 0 is a MONTHDAY, child 2 the end day, child 4 the year; both
		# end points share the month and year
		yeartp = TimePoint(int(numstr(r[4])))
		monthdaytp = monthday(r[0])
		return TimelineDate(
			TimePoint(yeartp.year, monthdaytp.month, monthdaytp.day, year_approx = yeartp.year_approx),
			TimePoint(yeartp.year, monthdaytp.month, int(numstr(r[2])), year_approx = yeartp.year_approx))
	def timename(timename): # returns TimelineDate
		if timename[0].node == 'antiquity':
			return TimelineDate(TimePoint(-750), TimePoint(450))


	parse = None
	if len(parses) > 1:
		# ambiguous parses will fall into 3 categories
		# 1. DATE/MONTHDAY ambiguity
		# for a string like December 3:
		#	MONTHDAY (prefer)
		#	DATE -> YAD -> YADYEARMONTH
		# or 3 December
		#
		# 2. DATERANGE/MONTHDAYRANGE ambiguity
		# for a string like 6 June - 3 October 2013
		#	MONTHDAYRANGE -> MONTHDAY TO DATE (prefer)
		#	DATERANGE -> DATE TO DATE -> YAD TO YAD -> YADYEARMONTH TO YADYEARMONTHDAY
		# (this is almost the same thing as problem 1)
		#
		# The other category is something like 3 December 4
		#	YADYEARMONTHDAY -> MONTHDAY ocommadotsp YADYEAR (prefer)
		#	YADYEARMONTHDAY -> YADYEAR ocommadotsp MONTHDAY
		# this should be extremely rare, and we will not bother dealing with these issues
		# this can also happen in DATERANGE

		temp = [p for p in parses if p[0].node == 'MONTHDAY']
		if len(temp) == 1:
			parse = temp[0]
		if not parse:
			temp = [p for p in parses if p[0].node == 'MONTHDAYRANGE']
			if len(temp) == 1:
				parse = temp[0]
		if not parse:
			# wrap in a 1-tuple so the formatting also works if date_text is
			# itself a tuple
			warnings.warn('not sure how to decide between multiple parses %s' % (date_text,))
			parse = parses[0]
	else:
		parse = parses[0]

	# dispatch on the label of the root's first child
	if parse[0].node == 'DATE':
		result = date(parse[0])
	elif parse[0].node == 'YEARSAGO':
		result = yearsago(parse[0])
	elif parse[0].node == 'DATERANGE':
		result = daterange(parse[0])
	elif parse[0].node == 'MONTH':
		result = TimelineDate(month(parse[0]))
	elif parse[0].node == 'MONTHDAY':
		result = TimelineDate(monthday(parse[0]))
	elif parse[0].node == 'MONTHDAYRANGE':
		result = monthdayrange(parse[0])
	elif parse[0].node == 'MONTHDAYYEARRANGE':
		result = monthdayyearrange(parse[0])
	elif parse[0].node == 'TIMENAME':
		result = timename(parse[0])
	else:
		# fail with a clear error instead of an UnboundLocalError on result
		raise RuntimeError('unexpected node type in date AST')

	return (result, len(date_text))