Ejemplo n.º 1
0
def untagged_par():
    """Build the parser sequence that recognises an untagged paragraph.

    The regular expression skips leading whitespace and captures, as the
    named group ``paragraph_text``, one or more items that are each either
    a character other than ``<`` or a literal ``<i>`` / ``</i>`` tag.  A
    negative lookahead at the start of the capture rejects the match when
    ``[Adjourned`` occurs before the next ``<``.  Trailing whitespace and
    an optional closing ``</ul>`` or ``</p>`` are consumed as well.

    Returns:
        The value of ``SEQ(...)`` applied to, in order: a DEBUG step, a
        TRACE step, the pattern, START('paragraph'), an ATTRIBUTES step
        setting ``type`` to the literal text ``"untagged"``, another TRACE
        step, OUT('$paragraph_text') and END('paragraph').
    """
    # Raw literal: the backslashes (\s, \[) belong to the regex, not to
    # Python string escapes.  The resulting string value is unchanged.
    untagged_regex = (
        r'\s*(?P<paragraph_text>(?![^<]*\[Adjourned)([^<]|<i>|</i>)+)'
        r'\s*(</ul>|</p>)?'
    )
    return SEQ(
        DEBUG('checking for untagged paragraph'),
        fd_parse.TRACE(False),
        pattern(untagged_regex),
        START('paragraph'),
        ATTRIBUTES(type='"untagged"'),
        fd_parse.TRACE(False, envlength=512),
        OUT('$paragraph_text'),
        END('paragraph'))
Ejemplo n.º 2
0
def plaindate(today='$today', rtn='date'):
    """Build the parser sequence for a plainly written date.

    The sequence is: an optional ``dayname``, then ``dayordinal``, then
    ``monthname``, then an optional ``year``, followed by two SET steps.

    Args:
        today: Text interpolated into the year expression; its ``.year``
            is used in the ``or`` branch of that expression.
        rtn: Name handed to the final SET step as its target.

    Returns:
        The value of ``SEQ(...)`` applied to the steps described above.
    """
    # Expression text handed to SET; presumably evaluated by the parser
    # framework so that a missing year falls back to today's year.
    year_expr = '$year and $year or str(%s.year)' % today
    date_expr = 'datetime.date(int($year), monthnum($monthname), int($day))'

    steps = [
        POSSIBLY(dayname),
        dayordinal,
        monthname,
        POSSIBLY(year),
        SET('year', year_expr),
        SET(rtn, date_expr),
    ]
    return SEQ(*steps)
Ejemplo n.º 3
0
def paragraph_text(partype='plain', no=None):
    """Build the parser sequence for the text of a paragraph.

    The pattern captures, as ``paragraph_text``, one or more items that
    are each a character other than ``<`` or a literal ``<i>`` / ``</i>``
    tag.  The captured text becomes the body of a ``paragraph`` ELEMENT.

    Args:
        partype: Value whose ``repr()`` is passed as the element's
            ``type`` keyword.
        no: Accepted for interface compatibility; not used in this body.

    Returns:
        The value of ``SEQ(...)`` applied to the pattern and the element.
    """
    text_matcher = pattern('(?P<paragraph_text>([^<]|<i>|</i>)+)')
    paragraph_element = ELEMENT(
        'paragraph',
        body='$paragraph_text',
        type=repr(partype))
    return SEQ(text_matcher, paragraph_element)
Ejemplo n.º 4
0
# Lengths of time written out in English, e.g. "three hours and 27 minutes".
# The template lists one alternative per line; engnumber12p and
# engnumber60p (defined elsewhere) are substituted into the placeholders.
timequantump = (
    '(an hour and a (half|quarter)'
    '|((a quarter|three quarters) of|half) an hour'
    '|%(engnumber60p)s minutes'
    '|%(engnumber12p)s (and a half hours|hours( and %(engnumber60p)s minutes)?))'
) % dict(engnumber60p=engnumber60p, engnumber12p=engnumber12p)

# Register the compiled pattern under the name 'timequantum'.
DEFINE('timequantum', pattern(timequantump))

# English times, eg "three minutes to four o'clock"
#
# Raw string literals are used for the pieces that contain regex
# backslashes (\s) so they are not read as Python string escapes; the
# resulting pattern text is unchanged.
#
# NOTE(review): as written, the final ")?" closes the outermost group, so
# the whole time expression is optional while "o'clock" is required
# after the hour -- confirm this is intended.
archtimep = (
    '(twelve (noon|midnight)|(a quarter past|half-past|a quarter to|'
    + engnumber60p
    + r' minutes (to|past)|)\s*'
    + engnumber12p
    + r"(\s*o'\s*clock))?"
)

# NOTE(review): the trailing "(?i)" is an inline global flag that is not at
# the start of the expression; the re module rejects that from Python 3.11
# on.  Left as-is because how pattern() compiles the string is not visible
# here -- confirm before porting.
archtime = DEFINE(
    'archtime',
    SEQ(pattern(r'\s*(?P<archtime>' + archtimep + ')(?i)'),
        ELEMENT('time', archtime='$archtime')))

# Date handling
#
# Regex fragments (names ending in "p") and the patterns built from them.
# Raw string literals are used wherever the regex contains a backslash
# (\d, \s) so it is not read as a Python string escape; the resulting
# pattern text is unchanged.

monthnamep = '(January|February|March|April|May|June|July|August|September|October|November|December)'
daynamep = '(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
ordinalp = '(st|nd|rd|th)'

# Day number with ordinal suffix, month name, then a run of digits.
# NOTE(review): the trailing "(?i)" is an inline global flag that is not at
# the start of the expression; the re module rejects that from Python 3.11
# on.  Left as-is because the places this fragment is used are not visible
# here -- confirm before porting.
datep = r'\d+' + ordinalp + r'\s+' + monthnamep + r'\s+\d+(?i)'

# Day name, captured as "dayname", with surrounding whitespace consumed.
dayname = pattern(r'\s*(?P<dayname>' + daynamep + r')\s*')

# Month name, captured as "monthname", with surrounding whitespace consumed.
monthname = pattern(r'\s*(?P<monthname>' + monthnamep + r')\s*')

# Exactly four digits, captured as "year", then optional whitespace.
year = pattern(r'(?P<year>\d{4})\s*')

# Digits captured as "day", followed by an ordinal suffix.
dayordinal = pattern(r'\s*(?P<day>\d+)' + ordinalp + r'\s*')
Ejemplo n.º 5
0
import os
import os.path
import re
import sys
import fd_parse
from fd_dates import *

from fd_parse import SEQ, OR, ANY, POSSIBLY, IF, START, END, OBJECT, NULL, OUT, DEBUG, STOP, FORCE, CALL, ATTRIBUTES, ELEMENT, DEFINE, pattern, tagged, plaintextpar, plaintext

sys.path.append("../")
from xmlfilewrite import WriteXMLHeader
from contextexception import ContextException
from miscfuncs import toppath

# Matches a paragraph introduced by an opening <p ...> tag (optionally
# followed by <ul>) and wraps its text in a 'paragraph' element.
#
# The regex skips leading whitespace, matches the opening tag(s), captures
# as "partext" zero or more items that are each a character other than "<"
# or a literal <i> / </i> tag, and then consumes an optional </ul> and an
# optional </p>.  The sequence is wrapped in ANY(...), presumably so it
# can apply repeatedly -- see fd_parse for the exact semantics.
#
# Raw literal: the backslash in \s belongs to the regex, not to a Python
# string escape.  The resulting pattern text is unchanged.
splitparagraphs = ANY(
    SEQ(
        pattern(
            r'\s*(<p([^>]*?)>(<ul>)?)(?P<partext>([^<]|<i>|</i>)*)(</ul>)?(</p>)?'
        ),
        ELEMENT('paragraph', body='$partext')))


def namepattern(label='name'):
    """Return regex source for a name, captured in a named group.

    Args:
        label: Name given to the capturing group.

    Returns:
        A string of the form ``(?P<label>[-A-Za-z .']+)``: one or more
        hyphens, ASCII letters, spaces, full stops or apostrophes.
    """
    group_open = "(?P<" + label + ">"
    name_chars = "[-A-Za-z .']+"
    return group_open + name_chars + ")"


# Patterns specific to Votes and Proceedings that are used frequently.
#
# Each DEFINE call registers a pattern under the given name.  Raw string
# literals are used where the regex contains a backslash (\s, \d) so it is
# not read as a Python string escape; the pattern text is unchanged.

# One or more hyphens, ASCII letters, spaces or full stops.
DEFINE('mp', pattern('[-A-Za-z .]+'))
# Shortest run of letters, digits, whitespace and "-.,()" characters.
DEFINE('act', pattern(r'[-a-z.,A-Z0-9()\s]*?'), fragment=True)
# "Speaker", optionally preceded by "Deputy ".
DEFINE('speaker', pattern('(Deputy )?Speaker'))
# One or more digits.
DEFINE('number', pattern(r'\d+'))
# "1st", "2nd", "3rd", or digits followed by "th".
DEFINE('ordinal', pattern(r'1st|2nd|3rd|\d+th'))
# Shortest run of lower-case letters, spaces and ".?;,()" characters.
DEFINE('text', pattern('[a-z .?;,()]*?'), fragment=True)