def untagged_par():
    """Build the parse sequence for a paragraph with no opening tag.

    The regular expression captures, as ``paragraph_text``, a run made of
    non-``<`` characters and ``<i>``/``</i>`` tags, optionally followed by
    a closing ``</ul>`` or ``</p>``.  Its negative lookahead refuses to
    match when ``[Adjourned`` appears before the next ``<``.

    After the pattern come START/ATTRIBUTES/OUT/END steps which, judging by
    their names, emit the captured text inside a ``paragraph`` element with
    ``type="untagged"`` (behaviour of those helpers is defined in fd_parse
    -- confirm there).
    """
    untagged_regex = (
        r'\s*(?P<paragraph_text>(?![^<]*\[Adjourned)([^<]|<i>|</i>)+)'
        r'\s*(</ul>|</p>)?'
    )
    # Steps are created in the same order in which they are applied.
    steps = [
        DEBUG('checking for untagged paragraph'),
        fd_parse.TRACE(False),
        pattern(untagged_regex),
        START('paragraph'),
        ATTRIBUTES(type='"untagged"'),
        fd_parse.TRACE(False, envlength=512),
        OUT('$paragraph_text'),
        END('paragraph'),
    ]
    return SEQ(*steps)
def plaindate(today='$today', rtn='date'):
    """Build the parse sequence for a date such as "Monday 3rd March 2003".

    The sequence is: an optional day name, a day ordinal, a month name and
    an optional four-digit year, followed by two SET steps.

    today -- expression text substituted into the year-fallback expression
             (its ``.year`` is referenced when no year was matched).
    rtn   -- name under which the resulting ``datetime.date`` expression is
             stored by the final SET step.

    NOTE(review): the two expression strings are evaluated by the fd_parse
    machinery, not here; presumably ``$name`` refers to a captured value --
    confirm against fd_parse.SET.
    """
    # Falls back to the year of `today` when the optional year is absent.
    year_expr = '$year and $year or str(%s.year)' % today
    date_expr = 'datetime.date(int($year), monthnum($monthname), int($day))'
    return SEQ(
        POSSIBLY(dayname),
        dayordinal,
        monthname,
        POSSIBLY(year),
        SET('year', year_expr),
        SET(rtn, date_expr),
    )
def paragraph_text(partype='plain', no=None):
    """Build the parse sequence for the text of a paragraph.

    The pattern captures, as ``paragraph_text``, one or more non-``<``
    characters or ``<i>``/``</i>`` tags; an ELEMENT step named
    ``paragraph`` is then created with that text as its body.

    partype -- paragraph type; its ``repr()`` is passed as the ``type``
               argument of the element.
    no      -- accepted for backward compatibility; currently unused.
    """
    text_step = pattern(r'(?P<paragraph_text>([^<]|<i>|</i>)+)')
    element_step = ELEMENT(
        'paragraph',
        body='$paragraph_text',
        type=repr(partype),
    )
    return SEQ(text_step, element_step)
# Durations written in words, e.g. "three hours and 27 minutes".
timequantump = (
    '(an hour and a (half|quarter)'
    '|((a quarter|three quarters) of|half) an hour'
    '|%(engnumber60p)s minutes'
    '|%(engnumber12p)s (and a half hours|hours( and %(engnumber60p)s minutes)?))'
) % {
    'engnumber12p': engnumber12p,
    'engnumber60p': engnumber60p,
}

DEFINE('timequantum', pattern(timequantump))

# English times, eg "three minutes to four o'clock"
# NOTE(review): assuming the interpolated number patterns are self-contained
# groups, the trailing "?" makes the WHOLE alternation optional (so the
# pattern can match nothing) while "o'clock" is mandatory after a number --
# confirm this is intended.
archtimep = (
    '(twelve (noon|midnight)|(a quarter past|half-past|a quarter to|'
    + engnumber60p
    + r' minutes (to|past)|)\s*'
    + engnumber12p
    + r"(\s*o'\s*clock))?"
)

# The global (?i) flag is placed at the very start of the expression: a
# global inline flag anywhere else is deprecated since Python 3.6 and is an
# error from Python 3.11, and at the start it has the same meaning on every
# version.
archtime = DEFINE(
    'archtime',
    SEQ(
        pattern(r'(?i)\s*(?P<archtime>' + archtimep + ')'),
        ELEMENT('time', archtime='$archtime'),
    ),
)

# Date handling
monthnamep = '(January|February|March|April|May|June|July|August|September|October|November|December)'
daynamep = '(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
ordinalp = '(st|nd|rd|th)'

# Case-insensitive "<day><ordinal> <month> <year>"; (?i) leads the
# expression for the same reason as in archtime above.
datep = r'(?i)\d+' + ordinalp + r'\s+' + monthnamep + r'\s+\d+'

# Individual date components, each exposing a named group.
dayname = pattern(r'\s*(?P<dayname>' + daynamep + r')\s*')
monthname = pattern(r'\s*(?P<monthname>' + monthnamep + r')\s*')
year = pattern(r'(?P<year>\d{4})\s*')
dayordinal = pattern(r'\s*(?P<day>\d+)' + ordinalp + r'\s*')
import os
import os.path
import re
import sys  # used by sys.path.append below; previously never imported here

import fd_parse
from fd_dates import *
from fd_parse import (
    SEQ, OR, ANY, POSSIBLY, IF, START, END, OBJECT, NULL, OUT, DEBUG, STOP,
    FORCE, CALL, ATTRIBUTES, ELEMENT, DEFINE, pattern, tagged, plaintextpar,
    plaintext,
)

# The parent directory must be on sys.path before the imports that follow
# (presumably those modules live there -- confirm against the repo layout).
sys.path.append("../")

from xmlfilewrite import WriteXMLHeader
from contextexception import ContextException
from miscfuncs import toppath

# Any number of <p ...> paragraphs (optionally wrapping a <ul>); the text
# of each one becomes the body of a 'paragraph' element.
splitparagraphs = ANY(
    SEQ(
        pattern(
            r'\s*(<p([^>]*?)>(<ul>)?)(?P<partext>([^<]|<i>|</i>)*)(</ul>)?(</p>)?'
        ),
        ELEMENT('paragraph', body='$partext'),
    )
)


def namepattern(label='name'):
    """Return a regex fragment matching a name, captured as group *label*.

    The fragment accepts one or more letters, spaces, hyphens, full stops
    and apostrophes.
    """
    return "(?P<" + label + ">[-A-Za-z .']+)"


# Patterns specific to Votes and Proceedings that are used frequently
DEFINE('mp', pattern('[-A-Za-z .]+'))
DEFINE('act', pattern(r'[-a-z.,A-Z0-9()\s]*?'), fragment=True)
DEFINE('speaker', pattern('(Deputy )?Speaker'))
DEFINE('number', pattern(r'\d+'))
DEFINE('ordinal', pattern(r'1st|2nd|3rd|\d+th'))
DEFINE('text', pattern('[a-z .?;,()]*?'), fragment=True)