Exemple #1
0
    def regidcodes(self, minhgid, sdate, qnumsseen):
        """Assign GIDs to this question's heading, question parts and replies.

        Every GID is built from sdate and the first entry of self.qnums once
        that list has been sorted in place; the remaining qnums are recorded
        as alternative heading GIDs in self.altheadinggids.

        minhgid   -- lowest base GID seen so far (may be empty)
        sdate     -- date value formatted into every GID
        qnumsseen -- mapping of qnums already used; updated in place

        Returns the base GID of this question when minhgid is empty or the
        base GID compares lower than it, otherwise minhgid unchanged.
        Raises ContextException when there are no qnums at all, or when one
        of them is already present in qnumsseen.
        """
        # the first qnum in sorted order is the basis of everything below
        self.qnums.sort()
        if not self.qnums:
            # show the offending question before giving up
            print self.headingqb.stext[0]
            for ques in self.queses:
                print ques.stext
            raise ContextException('missing qnums on question')

        firstqnum = self.qnums[0]
        basegidq = 'uk.org.publicwhip/wrans/%s.%s' % (sdate, firstqnum)

        # the heading GID is the one that gets linked to
        self.headingqb.qGID = basegidq + ".h"

        # every other qnum becomes an alternative heading GID
        for otherqnum in self.qnums[1:]:
            althgid = 'uk.org.publicwhip/wrans/%s.%s.h' % (sdate, otherqnum)
            self.altheadinggids.append(althgid)

        # number the parts of the question (these are not linked to)
        for partno, ques in enumerate(self.queses):
            ques.qGID = "%s.q%d" % (basegidq, partno)
        for partno, reply in enumerate(self.replies):
            reply.qGID = "%s.r%d" % (basegidq, partno)

        # every qnum must be new; remember the ones we have now used
        for qnum in self.qnums:
            if qnum in qnumsseen:
                print "repeated qnum:", qnum
                raise ContextException('repeated qnum', None, qnum)
            qnumsseen[qnum] = 1

        # the lowest base GID labels the major heading; it is expected to be
        # stable but is not used for linking
        if minhgid and not (basegidq < minhgid):
            return minhgid
        return basegidq
    def __init__(self, date, stex):
        """Strip any question number from a paragraph, then tokenize it.

        date -- stored as self.sdate and passed on to the tokenizer
        stex -- paragraph text; bare '&' is escaped to '&amp;' first

        The match for the question number (found at the end of the text via
        reqnum, or at the front via refqnum) is kept in self.rmqnum; it is
        None when the search at the end finds nothing and there is no match
        at the front.  Raises ContextException when a number is found at both
        ends, or when fewer than 10 characters of non-tag text remain after
        removing a number from the front.
        """
        self.lastdate = ''
        self.toklist = []
        self.sdate = date

        # escape ampersands that are not already part of '&amp;'
        stex = re.sub('&(?!amp;)', '&amp;', stex)

        # a qnum at the end of the paragraph is cut off
        tailq = reqnum.search(stex)
        self.rmqnum = tailq
        if tailq:
            stex = stex[:tailq.start()]

        # a qnum stuffed into the front of the paragraph (by the grabber of
        # the speakername) is cut off as well
        headq = refqnum.match(stex)
        if headq:
            if tailq:
                raise ContextException(
                    'Found question number [%s] in para, but already found [%s] at end (this probably just means it is being quoted, and you just need to change [] to ().'
                    % (headq.group(1), tailq.group(1)))
            self.rmqnum = headq
            stex = stex[headq.end():]

            # sanity check: something other than markup must be left over
            remaining = re.sub('<[^>]*>', '', stex)
            if len(remaining) < 10:
                raise ContextException(
                    'Removing question number from para appears to have removed all text (this probably just means a footnote marker is using [], just change to ()).'
                )

        self.TokenizePhraseRecurse(date, stex, 0)
Exemple #3
0
def FilterWMSSpeakers(fout, text, sdate):
        """Rewrite speaker markup in text as <speaker> records on fout.

        text is split with recomb.  Fragments matched by respeakervals are
        resolved through memberList.matchwmsname and written as
        '<speaker ...>name</speaker>' lines, or dropped when the speaker
        name is blank; every other fragment is copied through unchanged.

        Raises ContextException when the name lookup fails, or when an
        unrecognised fragment still matches recomb or remarginal.
        """
        stampurl = StampUrl(sdate)

        for fragment in recomb.split(text):
                stampurl.UpdateStampUrl(fragment)

                found = respeakervals.match(fragment)
                if not found:
                        # no speaker here: check we have not missed anything
                        # obvious, then pass the fragment through
                        if recomb.match(fragment):
                                raise ContextException('regexpvals not general enough', fragment=fragment, stamp=stampurl)
                        marginal = remarginal.search(fragment)
                        if marginal:
                                raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fragment, stamp=stampurl)
                        fout.write(fragment)
                        continue

                # speaker detected
                anamestamp = found.group(1) or found.group(2) or ""
                spstr = found.group(3).strip()
                spstrbrack = found.group(4)
                if not spstr:
                        # blank speaker name: nothing is written for it
                        continue

                try:
                        result = memberList.matchwmsname(spstr, spstrbrack, sdate)
                except Exception, e:
                        # re-raise with the stamp and fragment attached
                        raise ContextException(str(e), stamp=stampurl, fragment=fragment)

                # put record in this place
                fout.write('%s<speaker %s>%s</speaker>\n' % (anamestamp, result.encode("latin-1"), spstr))
Exemple #4
0
def NewGrabLordDivisionProced(qbp, qbd):
    """Split the division preamble off the end of the preceding speech.

    qbp -- the speech object preceding the division; its stext list is
           searched backwards for the paragraph matching redivisionon
    qbd -- unused here; kept for interface compatibility

    Returns a new no-speaker qspeech holding the paragraphs from the
    division title onwards (qbp.stext is trimmed accordingly), or None when
    qbp is already a no-speaker block, in which case its text is only run
    through SubsPWtextset.

    Raises ContextException when qbp is not a speech/motion, has no text,
    contains no division title, or the paragraph following the title does
    not match renewlorddiv.
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech",
                               stamp=qbp.sstampurl)

    # walk backwards from the last paragraph until the division title is hit
    iskim = 1
    while iskim <= len(qbp.stext) and not redivisionon.match(
            qbp.stext[-iskim]):
        iskim = iskim + 1
    if iskim > len(qbp.stext):
        raise ContextException("Could not find Division 'title'",
                               stamp=qbp.sstampurl)

    # The totals must be in the paragraph right after the title.  When the
    # title is the very last paragraph there is no such paragraph, and the
    # index -iskim + 1 would be 0, i.e. wrap round to the first paragraph.
    if iskim == 1:
        print(qbp.stext[-1])
        raise ContextException("no totals before division",
                               stamp=qbp.sstampurl)

    totalspara = qbp.stext[-iskim + 1]
    if not renewlorddiv.match(totalspara):
        print(totalspara)
        raise ContextException("no totals before division",
                               stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # copy the skimmed lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim the given speech back by the lines just copied
    qbp.stext = qbp.stext[:-iskim]

    return qbdp
Exemple #5
0
def StripWransHeadings(headspeak, sdate):
    """Skip the boilerplate headings at the start of a written-answers file.

    headspeak -- sequence of heading records; as used here, element 0 of a
                 record is the heading text, element 1 is passed to the
                 stamp tracker, and element 2 is a sequence of speech
                 records whose element 1 is text
    sdate     -- date of the file, compared with the date heading

    Returns (i, stampurl): the index of the first heading that was not
    stripped, and a StampUrl updated from the stripped headings.

    Raises ContextException when the first heading is not a bare 'Initial',
    when the date heading does not conform, or when no stamp, page url or
    aname could be established.
    """
    # check and strip the first two headings in as much as they are there
    i = 0
    # the first record must be the 'Initial' heading with no speeches in it
    if (headspeak[i][0] != 'Initial') or headspeak[i][2]:
        print headspeak[0]
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # import pdb;pdb.set_trace()
    # an optional speech-free "Written Answers (to Questions)" heading is
    # skipped; when it is absent or has speeches nothing happens here
    if (not re.match(
            '(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?(?i)',
            headspeak[i][0])) or headspeak[i][2]:
        # this inner test has no effect (its body is only `pass`)
        if not re.match('The following answers were received.*',
                        headspeak[i][0]):
            pass
# print headspeak[i]
    else:
        i += 1

    # clean up the candidate date heading before handing it to the parser
    givendate = string.replace(headspeak[i][0], "&nbsp;", " ")
    givendate = re.sub("</?i>", "", givendate)

    # drop any leading <stamp aname=.../> tags
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)
    # The heading is accepted (and skipped) when it has no speeches and is
    # either an "answers received" line, a "question was answered on" line,
    # or a date equal to sdate.  NOTE: the date is only parsed when neither
    # of the two regexes matches, because of the short-circuiting `and`.
    if (not re.match('(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*', headspeak[i][0]) and
           not re.match('(?:<stamp[^>]*>)?The following question was answered on.*', headspeak[i][0]) and \
     (sdate != mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[i][2]:
        # otherwise it must at least be a known major heading without
        # speeches; in that case it is left in place (i is not advanced)
        if (not parlPhrases.wransmajorheadings.has_key(
                headspeak[i][0])) or headspeak[i][2]:
            print headspeak[i]
            raise ContextException('non-conforming second heading',
                                   stamp=None,
                                   fragment=headspeak[i][0])
    else:
        i += 1

    # find the url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for j in range(0, i):
        stampurl.UpdateStampUrl(headspeak[j][0])
        stampurl.UpdateStampUrl(headspeak[j][1])

# Later editions seem to miss first column number, sigh
    if not stampurl.stamp:
        # take the first 'colnum="<n>W"' found in any heading's speeches and
        # synthesise a stamp for the column before it (n - 1)
        for speeches in headspeak:
            text = ''.join([speech[1] for speech in speeches[2]])
            m = re.search('colnum="(\d+)W"', text)
            if m:
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>' %
                                        (sdate, int(m.group(1)) - 1))
                break

    # all three pieces of the stamp must be known by now
    if not stampurl.stamp or not stampurl.pageurl or not stampurl.aname:
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)
    def EndHeading(self, nextheading, nextmajor=None):
        """Close the current heading section and start the next one.

        Ends the current speech, records the finished section as a
        (heading, unspoketext, shspeak, major) tuple on self.shtext and
        resets the per-heading state for nextheading / nextmajor.

        Raises ContextException when merging dangling text into the heading
        makes it longer than 100 characters.
        """
        self.EndSpeech()

        if self.shspeak and (self.heading == 'Initial'):
            print 'Speeches without heading'

        # Concatenate unspoken text with the title if it is a dangle outside
        # the heading, e.g. in 2003-01-15 the heading
        # "Birmingham Northern Relief Road " has the extra bit
        # "(Low-noise Tarmac)" to pull in.
        dangling = self.unspoketext
        onlymarkup = re.match('(?:<[^>]*?>|\s)*$', dangling)
        if not onlymarkup:
            # "." is deliberately left out of the character class to avoid
            # matching "19." before a paragraph starts
            gho = re.match(
                '(\s*[()A-Za-z\-,\'\"/&#; 0-9]+)((?:<[^>]*?>|\s)*)$',
                dangling)
            if gho and not renotheadingmarg.search(dangling):
                # merge and collapse whitespace; the unspoken text itself is
                # left untouched
                merged = re.sub("\s+", " ", self.heading + ' ' + gho.group(1))
                self.heading = merged
                if len(merged) > 100:
                    raise ContextException(
                        "Suspiciously long merged heading part - is it OK? %s"
                        % merged,
                        stamp=None,
                        fragment=merged)

        finished = (self.heading, self.unspoketext, self.shspeak, self.major)
        self.shtext.append(finished)

        # fresh state for the section that starts now
        self.heading = nextheading
        self.major = nextmajor
        self.unspoketext = ''  # for holding colstamps
        self.shspeak = []
Exemple #7
0
    def MatchRevName(self, fss, sdate, stampurl):
        """Resolve a reversed lord name of the form 'Name[ of Place], T.'.

        fss      -- non-empty name string ending in an abbreviated title
                    (L, B, Abp, Bp, V, E, D, M, C or Ly, optional full stop)
        sdate    -- passed through to GetLordID
        stampurl -- attached to errors and passed through to GetLordID

        Returns self.aliases[fullname] when the rebuilt
        'Title Name[ of Place]' is a known alias, otherwise the result of
        self.GetLordID.  Raises ContextException when fss does not have the
        expected format.
        """
        assert fss
        parsed = re.match(
            '(.*?)(?: of (.*?))?, ? ?((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$', fss)
        if not parsed:
            print "$$$%s$$$" % fss
            raise ContextException("No match of format in MatchRevName",
                                   stamp=stampurl,
                                   fragment=fss)
        rawname, rawplace, abbrev = parsed.groups()

        # titleconv is keyed on the abbreviation including its full stop
        if not abbrev.endswith('.'):
            abbrev = abbrev + "."
        ltitle = titleconv[abbrev]

        # tidy the name: no full stops, real apostrophes, lower-case "de"
        llordname = rawname.replace(".", "").replace("&#039;", "'")
        llordname = re.sub("^De ", "de ", llordname)
        fullname = '%s %s' % (ltitle, llordname)

        llordofname = ""
        if rawplace:
            llordofname = rawplace.replace(".", "")
            fullname = '%s of %s' % (fullname, llordofname)

        if fullname in self.aliases:
            return self.aliases[fullname]

        return self.GetLordID(ltitle, llordname, llordofname, "", stampurl,
                              sdate, True)
Exemple #8
0
def GrabWestminDivisionInterruptProced(qbp, rawtext):
    """Split a trailing 'sitting suspended' notice off a speech.

    qbp     -- speech object whose stext paragraph list is inspected
    rawtext -- raw text; everything from the 'sitting suspended' paragraph
               onwards is cut off before it is handed to qspeech

    Returns a new no-speaker qspeech holding the last one or two paragraphs
    of qbp.stext (which are removed from qbp), or None when qbp has fewer
    than three paragraphs or does not end in a suspension notice.

    Raises ContextException when the last paragraph is an 'on resuming'
    line but the one before it is not a suspension/adjournment line.
    """
    paras = qbp.stext
    if len(paras) < 3:
        return None

    lastpara = paras[-1]
    if re.search("italic.*?>on resuming&\S*</p>(?i)", lastpara):
        # 'on resuming' must be preceded by the suspension line itself
        if not re.search(
                "italic.*?>(?:sitting )?(?:suspended|adjourned)(?: for (?:a division|divisions) in the house)?[\.\s]*(?i)",
                paras[-2]):
            raise ContextException(
                'failed to detect sitting suspended interruption',
                fragment=paras[-2])
        ntail = 2
    elif re.search("italic.*?>sitting suspended(?: for| until| till|\.)(?i)",
                   lastpara):
        ntail = 1
    else:
        return None

    # copy the lines into a non-speaking paragraph.
    dumtext = re.sub(
        '<p>(?:<stamp aname="[^"]*?"/>)?<i>sitting suspended.*(?si)', '',
        rawtext)
    s = copy.copy(qbp.sstampurl)
    # NOTE(review): the first object is discarded straight away; it is
    # presumably built only for its effect on the copied stamp `s`
    # (timestamps?) -- confirm against qspeech before removing it.
    qbdp = qspeech('nospeaker="true"', dumtext, s)
    qbdp = qspeech('nospeaker="true"', "", s)
    qbdp.typ = 'speech'
    qbdp.stext = paras[-ntail:]

    # trim the given speech back by the lines just moved
    qbp.stext = paras[:-ntail]
    return qbdp
Exemple #9
0
    def GetLordIDfname(self, name, loffice, sdate, stampurl=None):
        """Resolve a lord's full written name to an ID.

        name     -- name as written; a leading 'The ' is dropped and ' Of '
                    is lower-cased before anything else
        loffice  -- passed through to GetLordID
        sdate    -- passed through to GetLordID
        stampurl -- attached to errors and passed through to GetLordID

        Returns the alias entry when name is in self.aliases, a fixed person
        URI for "Queen", and otherwise the result of self.GetLordID on the
        title / name / place parts picked out by honcompl.  Raises
        ContextException when honcompl does not match.
        """
        name = re.sub("^The ", "", name).replace(' Of ', ' of ')

        if name in self.aliases:
            return self.aliases[name]
        if name == "Queen":
            return "uk.org.publicwhip/person/13935"

        parts = honcompl.match(name)
        if not parts:
            raise ContextException("lord name format failure on '%s'" % name,
                                   stamp=stampurl,
                                   fragment=name)

        # now we have a speaker, try and break it up: group 1 is a title
        # with no name; otherwise groups 2 and 3 give title and name
        if parts.group(1):
            ltit, lname = parts.group(1), ""
        else:
            ltit, lname = parts.group(2), parts.group(3)
        ltit = re.sub("  ", " ", ltit)

        # optional place part, with honorifics removed
        lplace = ""
        rawplace = parts.group(4)
        if rawplace:
            lplace = rehonorifics.sub("", re.sub("  ", " ", rawplace))

        lname = rehonorifics.sub("", re.sub("^De ", "de ", lname))

        return self.GetLordID(ltit, lname, lplace, loffice, stampurl, sdate,
                              False)
Exemple #10
0
def StripDebateHeading(hmatch, ih, headspeak, bopt=False):
    """Step over heading number ih when it matches hmatch.

    hmatch    -- regular expression for the heading text; leading
                 <stamp aname=.../> tags and whitespace are allowed for
    ih        -- index into headspeak of the heading to check
    headspeak -- heading records: element 0 is the heading text, element 2
                 is tested for emptiness
    bopt      -- when true the heading is optional

    Returns ih + 1 when the heading matches and its element 2 is empty.
    Otherwise returns ih when bopt is set, and raises ContextException
    when it is not.
    """
    # print "StripDebateHeading", hmatch
    reheadmatch = '(?:<stamp aname="[^"]*"/>)*\s*' + hmatch
    entry = headspeak[ih]

    if re.match(reheadmatch, entry[0]) and not entry[2]:
        return ih + 1

    # optional heading that is not there: stay on the same index
    if bopt:
        return ih

    print "headspeak", headspeak[ih][:2]
    if entry[2]:
        raise ContextException(
            'non-conforming section after "%s" heading. FOR EXAMPLE: "in the chair" missing <h4><center> '
            % hmatch,
            fragment=entry[0])

    # element 2 is empty here, so it was the heading text that did not match
    print reheadmatch
    print entry[2]
    raise ContextException('non-conforming "%s" heading ' % hmatch,
                           fragment=entry[0])
Exemple #11
0
def FixHTMLEntities(stex, signore='', stampurl=None):
    """Return stex with entities fixed, as a latin-1 encoded string.

    The pieces returned by FixHTMLEntitiesL are concatenated, decoded as
    UTF-8 and re-encoded as latin-1.  Any failure while decoding or
    encoding is reported on stdout and re-raised as a ContextException
    carrying stampurl and the text reached so far.
    """
    res = ''.join(FixHTMLEntitiesL(stex, signore, stampurl))
    try:
        # res is rebound on purpose: if only the encode step fails, the
        # decoded text is what gets reported below
        res = res.decode('utf-8')
        latin = res.encode("latin-1")
    except Exception, e:
        print "Encoding problem with:", res
        raise ContextException(str(e), stamp=stampurl, fragment=res)
    else:
        return latin
Exemple #12
0
def StripDebateHeading(hmatch, ih, headspeak, bopt=False):
    """Step over heading number ih when it matches hmatch.

    The heading text (element 0 of headspeak[ih]) may start with
    <stamp aname=.../> tags, and element 2 of the record must be empty.
    Returns ih + 1 on a match.  Otherwise returns ih when bopt is set and
    raises ContextException when it is not.
    """
    reheadmatch = '(?:<stamp aname="[^"]*"/>)*' + hmatch
    entry = headspeak[ih]

    if re.match(reheadmatch, entry[0]) and not entry[2]:
        return ih + 1

    # optional heading that is not there: stay on the same index
    if bopt:
        return ih

    print "\n", headspeak[ih]
    raise ContextException('non-conforming "%s" heading ' % hmatch)
Exemple #13
0
def RunVotesFilters(fout, text, sdate, sdatever):
    """Parse vote text and write the resulting XML to fout.

    text is parsed with parsevotetext(text, sdate).  On success the parse
    result's delta is applied and serialised to fout as ISO-8859-1 XML;
    otherwise a ContextException is raised that quotes the result and the
    first 128 characters of the parser's first return value.  sdatever is
    not used.
    """
    (s, env, result) = parsevotetext(text, sdate)

    if not result.success:
        raise ContextException("Failed to parse vote\n%s\n%s" %
                               (result, s[:128]))

    document = result.delta.apply(None)
    document.writexml(fout, encoding="ISO-8859-1")
Exemple #14
0
def MpTellerList(fsm, vote, stampurl, sdate):
    """Turn the teller lines of a division into <mpname> XML lines.

    fsm      -- iterable of text fragments from the tellers section
    vote     -- value written into the vote="..." attribute
    stampurl -- attached to any ContextException raised
    sdate    -- passed to memberList.matchfullnamecons

    Returns a list of at most two '<mpname ... teller="yes">' strings.
    Raises ContextException when a line cannot be parsed, a name cannot be
    matched, or a third teller turns up.
    """
    res = []
    for fragment in fsm:
        # the closing </b> of "Tellers for the (Ayes|Noes):" and an 'and'
        # that got a paragraph of its own carry no name
        if fragment in ('</b>', '<b> and</b>'):
            continue

        # split by lines, but linefeed sometimes missing: each pass takes
        # one "Name (Constituency)" off the front, the rest after " and"
        # goes round again
        pending = fragment
        while pending:
            gftell = re.match(
                '\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$',
                pending)
            if not gftell:
                raise ContextException("no match on teller line",
                                       stamp=stampurl,
                                       fragment=pending)

            tellername, tellercons, pending = gftell.groups()

            if len(res) >= 2:
                print fsm
                raise ContextException(' too many tellers ',
                                       stamp=stampurl,
                                       fragment=pending)

            # It always is
            if tellername == 'Mr. Michael Foster':
                tellercons = 'Worcester'

            matched = memberList.matchfullnamecons(tellername.strip(),
                                                   tellercons, sdate)
            (mpid, remadename, remadecons) = matched
            #print tellername, " ++> ", remadename.encode("latin-1")
            if not mpid:
                raise ContextException("teller name bad match",
                                       stamp=stampurl,
                                       fragment=tellername)

            xmlline = '\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' % (
                mpid, vote, FixHTMLEntities(tellername))
            res.append(xmlline)

    return res
Exemple #15
0
def GrabLordDivisionProced(qbp, qbd):
    """Split the two-line division preamble off the preceding speech.

    qbp -- the speech object preceding the division; the last paragraph of
           qbp.stext must match relorddiv and the one before it resaidamend
    qbd -- unused here; kept for interface compatibility

    Returns a new no-speaker qspeech holding those two paragraphs (which
    are removed from qbp.stext), or None when qbp is already a no-speaker
    block, in which case its text is only run through SubsPWtextset.

    Raises ContextException when qbp is not a speech/motion, has no text,
    does not end in the expected line, or has no matching paragraph before
    it.
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech",
                               stamp=qbp.sstampurl)

    if not relorddiv.match(qbp.stext[-1]):
        print(qbp.stext[-1])
        raise ContextException("no lordships divided before division",
                               stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    # Only one paragraph is guaranteed so far: report a missing second one
    # as a parse problem rather than letting stext[-2] raise IndexError.
    if len(qbp.stext) < 2:
        print(qbp.stext[-1])
        raise ContextException("no on said amendment",
                               stamp=qbp.sstampurl,
                               fragment=qbp.stext[-1])
    if not resaidamend.match(qbp.stext[-2]):
        print(qbp.stext[-2])
        raise ContextException("no on said amendment",
                               stamp=qbp.sstampurl,
                               fragment=qbp.stext[-2])
    iskim = 2

    # copy the two lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim back the given one by two lines
    qbp.stext = qbp.stext[:-iskim]

    return qbdp
Exemple #16
0
def LordsDivisionParsingPart(divno, unspoketxt, stampurl, sdate):
    """Cut one division listing off the front of unspoketxt.

    divno      -- division number written into the speech attributes
    unspoketxt -- text starting with the division listing
    stampurl   -- current stamp; must carry a timestamp
    sdate      -- date string, compared against '2008-12-01' and written
                  into the speech attributes

    Returns (remaining_text, qbd) where qbd is a qspeech of typ 'division'
    whose stext is the LordsFilterDivision output for the division text.

    Raises ContextException when the end of the division cannot be located
    or when stampurl has no timestamp.
    """
    # find the ending of the division and split it off.
    gquesacc = re.search(regenddiv, unspoketxt)
    if gquesacc:
        splitat = gquesacc.start(1)
        divtext = unspoketxt[:splitat]
        unspoketxt = re.sub(':ENDDIVISION:', '', unspoketxt[splitat:])
    elif sdate > '2008-12-01':  # Sigh XXX
        # no end marker in later files: cut after the last ", X.</p>", or
        # failing that after the last <br>
        m = re.match(r'(?s).*, [A-Z]\.</p>', unspoketxt)
        if not m:
            m = re.match(r'(?s).*<br>', unspoketxt)
        if not m:
            # neither fallback applies; previously this fell through to an
            # AttributeError on m.group()
            raise ContextException("Could not find end of division listing",
                                   stamp=stampurl, fragment="Division")
        divtext = m.group()
        unspoketxt = unspoketxt[m.end():]
    else:
        print("division missing %s" % regenddiv)
        print(unspoketxt)
        print("is there a linefeed before the </center> on the CONTENTS?")
        raise ContextException("Division missing resolved in the",
                               stamp=stampurl, fragment="Division")

    divtext = re.sub(' style="margin-bottom:[^"]*"', '', divtext)

    # Add a division object (will contain votes and motion text)
    spattr = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno)
    qbd = qspeech(spattr, divtext, stampurl)
    qbd.typ = 'division'  # this type field seems easiest way

    if not stampurl.timestamp:
        raise ContextException("Division missing any timestamps; need to put one in to make it consistent.  like <h5>2.44 pm</h5>", stamp=stampurl, fragment="Division")

    # filtering divisions here because we may need more sophisticated detection
    # of end of division than the "Question accordingly" marker.
    qbd.stext = LordsFilterDivision(qbd.text, stampurl, sdate)

    return (unspoketxt, qbd)
Exemple #17
0
def SplitParaIndents(text, stampurl):
    """Split text into paragraphs, each with an indent/italic code.

    The list returned by SplitParaSpace(text, stampurl) alternates between
    spacer entries (even positions: sequences of tags) and paragraph text
    (odd positions).  The spacers are scanned to keep track of <ul> and
    margin-left indentation.

    Returns (res, resdent): the paragraph texts and, in parallel, a code
    per paragraph: 1 when indented, plus 2 when the whole paragraph was
    wrapped in <i>...</i> (in which case the wrapper is removed from the
    text).

    Raises ContextException when a <ul> opens while already <ul>-indented.
    """
    dell = SplitParaSpace(text, stampurl)
    #print "dell", dell

    res = []
    resdent = []
    bIndent = 0  # 0 = none, 1 = inside <ul>, 2 = inside margin-left <p>
    for i, chunk in enumerate(dell):
        if i % 2:
            # we have the actual text between the spaces
            # we might have full italics indent style
            # (we're ignoring fonts for now)
            cindent = 0
            if bIndent > 0:
                cindent = 1

            # separate out italics type paragraphs
            tex = chunk
            qitbod = re.match('<i>([\s\S]*?)</i>[.:]?$', tex)
            if qitbod:
                tex = qitbod.group(1)
                cindent += 2

            res.append(tex)
            resdent.append(cindent)
            continue

        # spacer entry: update the indent state from each tag in it
        for sp in chunk:
            if re.match('(?:<ul><ul>)?<ul>(?i)', sp):
                if bIndent == 1:
                    print dell[i - 1:i + 1]
                    raise ContextException(' already indented ',
                                           stamp=stampurl,
                                           fragment=sp)
                bIndent = 1
            elif re.match('(?:</ul></ul>)?</ul>(?i)', sp):
                # closing while not indented is deliberately not an error
                bIndent = 0
            elif re.match('<p style="margin-left: ?[23]0px;">', sp):
                bIndent = 2
            elif bIndent == 2 and re.match('</p>', sp):
                bIndent = 0

    # still being indented after the last spacer is not treated as an error
    return (res, resdent)
Exemple #18
0
def ParseRow(srow, hdcode, stampur):
    """Convert one HTML table row into a cleaned '<tr>...</tr>' string.

    srow    -- raw row text; split into cells with recolsplit
    hdcode  -- tag name used for every cell written
    stampur -- stamp passed to FixHTMLEntitiesL and attached to errors

    For each piece matched by recolmatch a cell is written with its
    colspan / rowspan / align attributes (groups 5, 3 and 4 of the match,
    with groups 2 and 1 as fall-backs for colspan and align) and its
    content (group 6) stripped of font, p, center, B and ul tags and of one
    leading and one trailing <br> or whitespace character.

    Raises ContextException when text outside the cells is anything other
    than stray closing tags and whitespace.
    """
    # build up the list of entries for this row
    pieces = ['\t\t<tr> ']
    for spcol in recolsplit.split(srow):
        col = recolmatch.match(spcol)
        if not col:
            # check that the outside text contains nothing but bogus close
            # column tags
            if not re.match('(?:</t[dh]>|</font>|\s)*$(?i)', spcol):
                print "spcol:", spcol
                print "srow:", srow
                print "srowsplit:", recolsplit.split(srow)
                raise ContextException("non column text",
                                       stamp=stampur,
                                       fragment=srow)
            continue

        # the later group wins when both alternatives are present
        colspan = ''
        spanval = col.group(5) or col.group(2)
        if spanval:
            colspan = ' colspan="%s"' % spanval

        rowspan = ''
        if col.group(3):
            rowspan = ' rowspan="%s"' % col.group(3)

        talign = ''
        alignval = col.group(4) or col.group(1)
        if alignval:
            talign = ' align="%s"' % alignval

        pieces.append('<%s%s%s%s>' % (hdcode, colspan, rowspan, talign))

        # flatten the cell and strip presentational markup
        coltext = re.sub('\n', ' ', col.group(6))
        coltext = re.sub(
            '</?font[^>]*>|</?p[^>]*>|</?center>|</?B>|</?ul>(?i)', '',
            coltext)
        coltext = re.sub('^(?:<br>|\s)(?i)', '', coltext)
        coltext = re.sub('(?:<br>|\s)$(?i)', '', coltext)

        pieces.extend(FixHTMLEntitiesL(coltext, '', stampurl=stampur))
        pieces.append('</%s> ' % hdcode)

    pieces.append('</tr>')
    return ''.join(pieces)
Exemple #19
0
def ExtractQnum(tex, stampurl, yearrange=(1980, 2020)):
    """Split the trailing question number off a paragraph.

    tex       -- paragraph text, expected to end in '[123]', '[HL123]',
                 '[123R]' or the same without the opening bracket
    stampurl  -- attached to the ContextException raised on error
    yearrange -- inclusive (first, last) range of bracketed numbers inside
                 the text that are tolerated because they look like years
                 rather than stray question numbers

    Returns (text, qnum): the text before the number, and the number as a
    string.  When tex does not end in a question number, (tex, '0') is
    returned.

    Raises ContextException when the remaining text contains another
    bracketed number that is neither inside yearrange nor excused by an
    '<ok-extra-qnum>' marker (the first such marker is removed from the
    returned text).
    """
    qn = re.match(r'(.*?)\s*\[?((?:HL)?\d+R?)\]$', tex)
    if not qn:
        # default when no qnum is found.  the 0 qnums are detected elsewhere
        # (should have used "0error") in MeasureBlockSimilarity for
        # gidmatching
        return (tex, '0')

    text = qn.group(1)

    # a second bracketed number inside the text is suspicious
    isqn = re.search(r'\[((?:HL)?(\d+)R?)\]', text)
    if isqn:
        nqn = int(isqn.group(2))
        firstyear, lastyear = yearrange
        if text.find("<ok-extra-qnum>") >= 0:
            # explicitly allowed by the marker, which is consumed
            text = text.replace("<ok-extra-qnum>", "", 1)
        elif firstyear <= nqn <= lastyear:
            # looks like a year in brackets
            pass
        else:
            print(tex)
            print('A colnum may be removing a necessary <p> tag before the (2)')
            raise ContextException('qnum in middle of index block',
                                   stamp=stampurl,
                                   fragment=isqn.group(1))
    return (text, qn.group(2))
Exemple #20
0
def FilterLordsColtime(fout, text, sdate):
	"""Rewrite column, time and anchor markers in text as <stamp/> tags.

	text is split with recomb and each fragment is handled in turn:

	* column-number fragments (recolumnumvals) are checked against sdate
	  and written as '<stamp coldate=... colnum=.../>' when the number is
	  the first one seen or advances by 1 to 5; other numbers are dropped;
	* time fragments (retimevals) are normalised by TimeProcessing and
	  written as '<stamp time=.../>';
	* headings stating the time in words (regtime3vals) are written back
	  unchanged, followed by a '<stamp time=.../>';
	* anchor fragments (reanamevals) are written as '<stamp aname=.../>'
	  and recorded in stampurl.aname;
	* everything else is copied to fout unchanged.

	Raises ContextException when a column date disagrees with sdate, or
	when an unrecognised fragment still matches recomb or remarginal.
	"""
	colnum = -1  # -1 until the first column stamp has been seen
	time = ''

	stampurl = StampUrl(sdate)
	previoustime = []  # times emitted so far, passed to TimeProcessing
	for fss in recomb.split(text):
		# column number type

		# we need some very elaborate checking to sort out the sections, by
		# titles that are sometimes on the wrong side of the first column,
		# and by colnums that miss the GC code in that section.
		# column numbers are also missed during divisions, and this exception
		# should be detected and noted.

		# That implies that this is the filter which detects the boundaries
		# between the standard four sections.
		columng = recolumnumvals.match(fss)
		if columng:
			# check date
			ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
			if sdate != ldate:
				raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stampurl, fragment=fss)

			# check number
                        # ltype = columng.group(2)
			lcolnum = string.atoi(columng.group(3))
			if lcolnum == colnum - 1:
				pass	# spurious decrementing of column number stamps
			elif lcolnum == colnum:
				pass	# spurious repeat of column number stamps
			# good (we get skipped columns in divisions)
			elif (colnum == -1) or (colnum + 1 <= lcolnum <= colnum + 5):  # was 2 but this caused us to miss ones
				colnum = lcolnum
				fout.write('<stamp coldate="%s" colnum="%s%s"/>' % (sdate, colnum, ""))

			# column numbers do get skipped during division listings
			else:
				pass #print "Colnum not incrementing %d -- %d -- %s" % (colnum, lcolnum, fss)
				#raise Exception, "Colnum not incrementing %d -- %d -- %s" % (colnum, lcolnum, fss)

			#print (ldate, colnum, lindexstyle)
			# the column fragment itself is never copied to the output
			continue

		timeg = retimevals.match(fss)
		if timeg:
			time = timeg.group(1)
			# a capture that starts with a closing tag is not a time;
			# such a fragment is dropped without writing anything
			if not re.match('(?:</h5>|</st>)(?i)', time):
				time = TimeProcessing(time, previoustime, False, stampurl)
				fout.write('<stamp time="%s"/>' % time)
				if time:
                                        previoustime.append(time)
			continue

		# special lift a time out of the heading
		regtime3 = regtime3vals.match(fss)
		if regtime3:
			fout.write(fss) # put this heading back into the flow of text
			# this kind of heading is only expected before any other time
			assert not previoustime
			lntimematch = re.match("(half[\- ]past )?(\w+)(-thirty)?$", regtime3.group(1))
			lnhour = lntimematch and lntimematch.group(2)
			# strange way to do it, but I'm keeping tab on examples, and the transition between am and pm
			if lnhour == "two":
				lntimep = "2:%s pm"
			elif lnhour == "three":
				lntimep = "3:%s pm"
			elif lnhour == "six":
				lntimep = "6:%s pm"
			elif lnhour == "nine":
				lntimep = "9:%s am"
			elif lnhour == "eleven":
				lntimep = "11:%s am"
			elif lnhour == "ten":
				lntimep = "10:%s am"
			else:
				# unknown hour word (or no match at all): stop here
				print "-------------'%s'" % regtime3.group(1)
				assert False
			# "half past" and "-thirty" must not both be present
			assert not lntimematch.group(1) or not lntimematch.group(3)
			ntime = lntimep % ((lntimematch.group(1) or lntimematch.group(3)) and "30" or "00")
			time = TimeProcessing(ntime, previoustime, False, stampurl)
			fout.write('<stamp time="%s"/>' % time)
			# NOTE: unlike the branch above, this time is not appended
			# to previoustime
			continue

		# anchor names from HTML <a name="xxx">
		anameg = reanamevals.match(fss)
		if anameg:
			aname = anameg.group(1)
			fout.write('<stamp aname="%s"/>' % aname)
			stampurl.aname = aname
			continue

		# nothing detected
		# check if we've missed anything obvious
		if recomb.match(fss):
			print "$$$", fss, "$$-$"
			raise ContextException(' regexpvals not general enough ', stamp=stampurl, fragment=fss) # a programming error between splitting and matching
		if remarginal.search(fss):
			# print what was found, to help with extending the patterns
			print remarginal.search(fss).group(0)
			lregcolumnum6 = '<p>\s*</ul>\s*<a name="column_\d+">(?:</a>)?\s*<b>[^:<]*:\s*column\s*\d+\s*</b></p>\s*<ul><font size=3>(?i)'
			print re.findall(lregcolumnum6, fss)
			#print fss
			raise ContextException(' marginal coltime detection case ', stamp=stampurl, fragment=fss)
		fout.write(fss)
Exemple #21
0
 def parse_day(self, input):
     """Parse one day's JSON component list into headings and speeches.

     input -- JSON text: either a list of component dicts, or an object
              holding that list under
              ['AllHansardComponentsList']['HansardComponent']

     Each component is dispatched on its 'ComponentType'.  Headings and
     speakers flush the speech collected so far through
     self.display_speech(); text components are appended to self.text as
     HTML paragraphs.

     Raises ContextException for an unknown component type, Exception for
     an unknown 'ComponentHeaderId', and AssertionError when the document
     title does not carry self.date.
     """
     # reset the per-day state
     self.heading = {}
     self.pre_heading = {}
     self.speaker = {}
     self.text = ''
     timestamp = ''
     j = json.loads(input)
     # unwrap the component list when it comes inside the wrapper object
     if 'AllHansardComponentsList' in j:
         j = j['AllHansardComponentsList']['HansardComponent']
     for line in j:
         # every '&' is escaped, whether or not it already starts an entity
         text = (line['ComponentText'] or '').replace('&', '&amp;')
         if not text:
             print "WARNING: Empty line: %s" % line
         elif line['ComponentType'] == 'Document Title':
             # the title must carry self.date (YYYY-MM-DD) as DD/MM/YYYY
             assert re.match(
                 '(Plenary|PLE), %s/%s/%s$' %
                 (self.date[8:10], self.date[5:7], self.date[0:4]), text)
         elif line['ComponentType'] == 'Time':
             timestamp = self.time_period(text)
         elif line['ComponentType'] == 'Header':
             # header ids 0 and 1 are major headings, 2 is a minor heading
             if line['ComponentHeaderId'] in (0, 1, '0', '1'):
                 typ = 'major'
             elif line['ComponentHeaderId'] in (2, '2'):
                 typ = 'minor'
             else:
                 raise Exception("Unknown ComponentHeaderId %s" %
                                 line['ComponentHeaderId'])
             if self.heading and self.heading['type'] == typ:
                 # a second heading of the same type in a row: remember
                 # the earlier text and join the two with an em dash
                 self.pre_heading = {
                     'level': line['ComponentHeaderId'],
                     'text': self.heading['text']
                 }
                 self.heading['text'] += ' &#8212; %s' % text
             else:
                 self.display_speech()
                 self.speaker = {'ts': timestamp}
                 # a remembered heading of the same level is used as a
                 # prefix; one of a greater level is forgotten
                 if self.pre_heading and self.pre_heading['level'] == line[
                         'ComponentHeaderId']:
                     text = '%s &#8212; %s' % (self.pre_heading['text'],
                                               text)
                 elif self.pre_heading and self.pre_heading['level'] > line[
                         'ComponentHeaderId']:
                     self.pre_heading = {}
                 self.heading = {'text': text, 'ts': timestamp, 'type': typ}
         elif re.match(
                 'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$',
                 line['ComponentType']):
             # RelatedItemId here is the NI speaker ID. We could use that!
             # But for now, carry on going by name as all that code exists.
             self.display_speech()
             speaker = text.replace(':', '')
             id, stri = memberList.match(speaker, self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
         elif line['ComponentType'] == 'Speaker (Special)' or line[
                 'ComponentType'] == 'Speaker (GuestSpeaker)':
             # these speakers are kept by name, without a member lookup
             self.display_speech()
             speaker = text.replace(':', '')
             self.speaker = {'name': speaker, 'ts': timestamp}
         elif line['ComponentType'] == 'Question':
             self.display_speech()
             # "<number>. <name> asked ...": the asker becomes the speaker
             # NOTE(review): m is None when the text has no " asked"; the
             # next line would then fail with AttributeError.
             m = re.match('(T?[0-9]+\. )?(.*?) asked', text)
             id, stri = memberList.match(m.group(2), self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
             self.text += "<p>%s</p>\n" % text
         elif line['ComponentType'] == 'Quote':
             self.text += '<p class="indent">%s</p>\n' % text
         elif line['ComponentType'] in ('Plenary Item Text',
                                        'Procedure Line'):
             # the opening line also sets the time of the current speech
             match = re.match(
                 'The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)',
                 text)
             if match:
                 timestamp = self.time_period(text)
                 self.speaker['ts'] = timestamp
             self.text += '<p class="italic">%s</p>\n' % text
         elif line['ComponentType'] == 'Bill Text':
             self.text += text.replace(
                 '<p>', '<p class="indent">')  # Already is HTML
         elif line['ComponentType'] in ('Division', 'Spoken Text'):
             # a double <BR /> is a paragraph break; quote the WIDTH value
             text = re.sub('\s*<BR />\s*<BR />\s*(?i)', '</p>\n<p>', text)
             text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
             self.text += '<p>%s</p>\n' % text
         else:
             raise ContextException("Uncaught Component Type! %s" %
                                    line['ComponentType'])
     # flush whatever is still pending at the end of the day
     self.display_speech()
Exemple #22
0
def StripDebateHeadings(headspeak, sdate):
    """Skip the boilerplate headings that open a day of Commons debates.

    Each entry of ``headspeak`` is indexed here as ``[0]`` (the heading
    text), ``[1]`` (handed to ``stampurl.UpdateStampUrl``) and ``[2]``
    (which must be empty for the date and "The House met at" headings).

    Returns:
        A tuple ``(ih, stampurl)``: the index of the first entry of
        ``headspeak`` that was not consumed, and a ``StampUrl`` updated from
        every consumed entry, with its ``timestamp`` set from the
        "The House met at ..." wording when that heading was parsed.

    Raises:
        Exception: if the date heading disagrees with ``sdate`` or no
            stamp/page url was found in the consumed headings.
        ContextException: if the "house met at" heading does not conform or
            names a start time that is not in the table below.
    """
    # NOTE: patterns handed to StripDebateHeading are passed through
    # unchanged (including any trailing "(?i)"), because that helper builds
    # the final regular expression itself.
    # NOTE(review): the meaning of the optional fourth argument (True) is
    # defined by StripDebateHeading, which is not visible here.

    # check and strip the first two headings in as much as they are there;
    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih,
                                headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
            True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih,
            headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        datemismatch = sdate != mx.DateTime.DateTimeFrom(givendate).date
        if datemismatch or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' %
                            (repr(headspeak[ih]), sdate))
        ih = ih + 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house '
            r'(?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)'
            r'(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        wording = gstarttime.group(1)
        wording = re.sub('</?i>', ' ', wording)
        wording = re.sub(r'\s+', ' ', wording)

        # Every wording seen so far, tried in this order, case-insensitively
        # and anchored at the start of the wording only.
        known_start_times = (
            ("half-past Nine", '09:30:00'),
            ("a quarter to Ten o", '09:45:00'),
            ("Ten o'clock", '10:00:00'),
            ("half-past Ten", '10:30:00'),
            ("Eleven o&#039;clock", '11:00:00'),
            (r"twenty-five minutes past\s*Eleven", '11:25:00'),
            (r"twenty-six minutes past\s*Eleven", '11:26:00'),
            (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
            ("half-past Eleven", '11:30:00'),
            ("Twelve noon", '12:00:00'),
            ("half-past One", '13:30:00'),
            ("half-past Two", '14:30:00'),
            ("twenty minutes to Three", '14:40:00'),
            ("10 minutes past Three", '15:10:00'),
            ("Six o'clock", '18:00:00'),
        )
        for pattern, clocktime in known_start_times:
            if re.match(pattern, wording, re.I):
                newtime = clocktime
                break
        else:
            raise ContextException("Start time not known: " + wording)
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
Exemple #23
0
def SplitLordsText(text, sdate):
    """Split one day of Lords text into its four sections.

    The split points are the first ``<a name=...>`` anchors that mark the
    Grand Committee, the Written Statements and the Written Answers.

    Returns:
        A list of four strings: main debate, grand committee, written
        ministerial statements, written answers.  A missing section is the
        empty string.  Each later non-empty section is prefixed with the last
        ``<page ...>`` tag found in the text preceding it.

    Raises:
        ContextException: on wrong column numbering in the main debate, a
            missing adjournment, or a missing Grand Committee / Written
            Answers title.
        AssertionError: when the anchors are out of order, the main debate
            is empty, or a section holds foreign column numbers.
        IndexError: when a section that needs a ``<page ...>`` tag has none.
    """
    res = ['', '', '', '']

    # Use a name tags
    wagc = re.search(
        r'(?i)(?:<br>&nbsp;<br>\s*)?<a name\s*=\s*"(?:gc|column_(?:GC|CWH)\d+|[0-9\-]+_cmtee0)">(?:</a>)?',
        text)
    wams = re.search(
        r'(?i)(?:<br>&nbsp;<br>\s*)?<a name="(?:wms|column_WS\d+)">(?:</a>)?',
        text)
    wama = re.search(
        r'(?i)(?:<br>&nbsp;<br>\s*)?<a name="(?:column_WA\d+|[\dw]*_writ0)">(?:</a>)?',
        text)

    # the sections are always in the same order, but sometimes there's one missing.

    # set end of house of lords section and check order
    # NOTE(review): when all three anchors are present only wagc < wams is
    # asserted; the relative order of wams and wama is not checked.
    if wagc:
        holend = wagc.start(0)
        if wams:
            assert holend < wams.start(0)
        elif wama:
            assert holend < wama.start(0)
    elif wams:
        holend = wams.start(0)
        if wama:
            assert holend < wama.start(0)
    elif wama:
        holend = wama.start(0)
    else:
        holend = len(text)

    # set the grand committee end
    res[0] = text[:holend]
    if wagc:
        if wams:
            gcend = wams.start(0)
        elif wama:
            gcend = wama.start(0)
        else:
            gcend = len(text)
        res[1] = text[holend:gcend]
    else:
        gcend = holend

    # set the ministerial statements end
    if wams:
        if wama:
            msend = wama.start(0)
        else:
            msend = len(text)
        res[2] = text[gcend:msend]
    else:
        msend = gcend

    # the written answers run to the end of the text
    if wama:
        res[3] = text[msend:]

    # lords splitting
    if IsNotQuiet():
        print("Lords splitting into parts of size:  %s" %
              [len(part) for part in res])

    # check the wrong column numbering or wrong titles aren't found in the wrong place
    assert res[0]  # there always is a main debate
    chns = re.search(r'<a name="column_\D+\d+">', res[0])
    if chns:
        print(chns.group(0))
        raise ContextException("wrong column numbering in main debate",
                               fragment=chns.group(0))

    # check that there is always an adjournment in the main debate, with some of the trash that gets put before it
    # this kind of overguessing is to get a feel for the variation that is encountered.
    if sdate not in ('2007-10-01', '2008-09-29', '2009-10-05', '2010-09-27', '2012-09-24', '2013-09-23') \
            and not re.search(
                r"(?i)(?:<ul><ul><ul>|<ul><ul><p>|\s*(?:<ul>|<p>)?|<p>\s*<ul><ul>(?:<ul>)?)\s*(?:Parliament was prorogued|House adjourned |For the continuation of today's proceedings)",
                res[0]):
        raise ContextException("house adjourned failure", stamp=None,
                               fragment=res[0][-100:])

    # remember the last page tag so the next section can start with it;
    # a main debate made of nothing but page tags counts as empty
    page = re.findall('<page[^>]*>', res[0])[-1]
    if re.match(r'(<page[^>]*>\s*)+$', res[0]):
        res[0] = ''

    # check the title of the Grand Committee
    if res[1]:
        res[1] = page + res[1]
        page = re.findall('<page[^>]*>', res[1])[-1]
        assert not re.search(r'<a name="column_(?!(?:GC|CWH))\D+\d+">', res[1])
        if not re.search(
                r'<(?:h[23] align=)?"?center"?>(?:<a name="[^"]*">(?:</a>)?)?\s*(?:(?:Official Report of the )?(?:(?:the)?Northern Ireland Orders )?Grand Committee|Second Reading Committee)',
                res[1]):
            raise ContextException("grand committee title failure",
                                   stamp=None, fragment=res[1][:100])

    # check the title is in the Written Statements section
    if res[2]:
        res[2] = page + res[2]
        page = re.findall('<page[^>]*>', res[2])[-1]
        assert not re.search(r'<a name="column_(?!WS)\D+\d+">', res[2])
        assert re.search(
            'center"?>(?:<a name="[^"]*">(?:</a>)?)?Written Statements?',
            res[2])

    # check the title and column numbering in the written answers
    if res[3]:
        res[3] = page + res[3]
        assert not re.search(r'<a name="column_(?!WA)\D+\d+">', res[3])
        # sometimes the s is missing
        if not re.search(
                r'<(?:h3 align=)?"?center"?>(?:<a name="[^"]*">(?:</a>)?)?\s*Written Answers?',
                res[3]):
            raise ContextException("missing written answer title",
                                   fragment=res[3])

    return res
Exemple #24
0
def CreateGIDs(gidpart, sdate, sdatever, flatb):
    """Assign a GID to every block in ``flatb``, numbering within columns.

    For each ``qb`` the column number is read from ``qb.sstampurl.stamp``
    (``colnum="..."``, overridden by the last ``parsemess-colnum="..."`` and
    shifted by the most recent ``parsemess-colnumoffset="..."`` seen so far).
    Blocks in the same column are numbered 0, 1, 2, ...; a block whose index
    is named by ``parsemess-missgid="N"`` gets the number it would have had
    plus a ``-1``, ``-2``, ... suffix, so the blocks after it keep the
    numbers they would have had without it.

    Sets on each ``qb``: ``ignorenamemismatch``, ``shortGID``, ``GID``,
    ``stextptags`` (one ``pid`` attribute string per entry of ``qb.stext``)
    and an empty ``gidredirect`` list.  Returns nothing.

    Raises:
        ContextException: if the column numbers do not strictly increase
            from one column to the next.
    """
    pcolnum = "####"
    picolnum = -1
    ncid = -1
    colnumoffset = 0

    # the missing gid numbers come previous to the gid they would have gone, to handle missing ones before the 0
    # 0-1, 0-2, 0, 1, 2, 3-0, 3-1, 3, ...
    ncmissedgidrun = 0
    ncmissedgid = 0

    for qb in flatb:
        stamp = qb.sstampurl.stamp

        # construct the gid
        realcolnum = re.search('colnum="([^"]*)"', stamp).group(1)

        # this updates any column number corrections that were appended on the end of the stamp
        # (the last correction wins)
        for realcolnum in re.findall('parsemess-colnum="([^"]*)"', stamp):
            pass

        # this is to do a mass change of column number when they've got out of sync with the GIDs
        # (normally due to Hansard's cm->vo transition); the offset stays in
        # force for all following blocks until another one replaces it
        for rawoffset in re.findall('parsemess-colnumoffset="([^"]*)"',
                                    stamp):
            colnumoffset = int(rawoffset)

        realcolnumbits = re.match(r'(\d+)([WS]*)$', realcolnum)
        colnumN = int(realcolnumbits.group(1)) + colnumoffset
        colnum = str(colnumN) + realcolnumbits.group(2)

        qb.ignorenamemismatch = re.search('parsemess-ignorenamemismatch="yes"',
                                          stamp)

        # this numbers the speech numbers in the column numbers
        if colnum != pcolnum:
            # check that the column numbers are increasing
            # this is essential if the gids are to be unique.
            if colnumN <= picolnum:
                print(stamp)
                raise ContextException("non-increasing column numbers %s %d" %
                                       (colnum, picolnum),
                                       stamp=qb.sstampurl,
                                       fragment=colnum)
            picolnum = colnumN

            pcolnum = colnum
            ncid = 0
            ncmissedgidrun = 0
            ncmissedgid = 0
        else:
            ncid += 1

        # this executes the missing ncid numbering command
        missgids = [int(m)
                    for m in re.findall('parsemess-missgid="([^"]*)"', stamp)]
        bmissgid = ncid in missgids

        if bmissgid:
            ncmissedgidrun += 1
            missedgidext = "-%d" % ncmissedgidrun
        else:
            ncmissedgidrun = 0
            missedgidext = ""

        # this is our GID !!!!
        qb.shortGID = '%s.%s.%d%s' % (sdatever, colnum, ncid - ncmissedgid,
                                      missedgidext)
        qb.GID = 'uk.org.publicwhip/%s/%s%s' % (gidpart, sdate, qb.shortGID)
        if bmissgid:
            ncmissedgid += 1

        # build the parallel set of GIDs for the paragraphs (in preparation for an upgrade)
        qb.stextptags = [
            ' pid="%s/%d"' % (qb.shortGID, i + 1) for i in range(len(qb.stext))
        ]

        # make a place to record the gidredirects which we obtain on the way through
        qb.gidredirect = []
Exemple #25
0
def MatchPWmotionStuff(qb, ispeechstartp1):
    """Classify paragraph ``qb.stext[ispeechstartp1]`` as procedural text.

    The paragraph is tested against a fixed sequence of patterns; the first
    one that matches decides the result, so the order of the checks matters.

    Returns:
        One of "withdrawn", "notmoved", "suspended", "misc", "agreedto",
        "negatived", "disagreedto", "considered", "bill", "adjourned",
        "resumed", "message", "divided" or "title", or None when the
        paragraph is not recognised as motion text.

    Raises:
        ContextException: for "marginal" paragraphs that look like a
            withdrawn / not moved / agreed-to statement but that none of the
            precise patterns accepted.
    """
    qpara = qb.stext[ispeechstartp1]

    if re.match(
            r'(?i)<p>(?:\[|<i>)*(?:Amendments?|Motion),? ?.{0,60}?(?:by leave)?,? withdrawn\.?,?(?:\]|</i>)*</p>',
            qpara):
        return "withdrawn"

    # [<i>Amendments Nos. 131 and 132 not moved.</i>]</p>
    if re.match(
            r'(?i)<p[^>]*>(?:\[|<i>)+Amendments? .{0,80}?(not moved|had been withdrawn from the Marshalled List|had been retabled as(?:Nos?\.|[^<\.\]]){0,60})(?:\.|</i>|\])+</p>',
            qpara):
        return "notmoved"
    if re.match(r'<p>Motion not moved\.</p>', qpara):
        return "notmoved"
    if re.match(
            r'(?i)<p>\[(?:<i>)?The Sitting was suspended .{0,60}?(?:</i>)?\](?:</i>)?</p>',
            qpara):
        return "suspended"
    if re.match(r'<p>\[(?:<i>)?The House observed.{0,60}?(\]|\.|</i>)+</p>',
                qpara):
        return "misc"
    if re.match(
            r'<p>\[(?:<i>)?The page and line refer(?:ences are)? to .{0,160}?</p>',
            qpara):
        return "misc"

    # Needed to avoid lords on 2012-07-03 thinking this is someone withdrawing Amendment 63.
    if re.match('<p>Amendment 63 has been withdrawn, so I turn now to', qpara):
        return None
    # Needed to avoid lords on 2012-07-04 thinking this is someone withdrawing an Amendment rather than discussing it.
    if re.match(
            '<p>My Amendment 148G has been withdrawn from the Marshalled List,',
            qpara):
        return None

    if re.match(
            '<p>.{0,10}?(?:Amendment.{0,50}?|by leave, )(?<!semi-)withdrawn',
            qpara):
        raise ContextException(
            "Marginal withdrawn (fragment looks like it might be a withdrawn amendment, \nbut earlier regexp didn't pick it up)",
            stamp=qb.sstampurl,
            fragment=qpara)
    if re.match(r'<p>\s*\[<i>', qpara):
        raise ContextException(
            "Marginal notmoved (fragment looks like it might be an amendment not moved, \nbut an earlier regexp didn't pick it up)",
            stamp=qb.sstampurl,
            fragment=qpara)

    if re.match(
            r'(?i)<p>(?:<i>)?(?:Moved.? accordingly,? and,? )?(?:[Oo]n [Qq]uestion,? )?(?:[Oo]riginal )?(?:[Mm]otion|[Aa]mendment|[Ss]chedule)s?(?: No\. \d+| [A-Z])?(?:, as amended)?,? agreed to(?:\.|&mdash;)+(?: Commons amendments?)?(?:</i>)?</p>',
            qpara):
        return "agreedto"
    clauseAgreedMatch = re.match(
        r'<p>(?:(?:Clause|Schedule)s? \d+[A-Z]*,?(?:, \d+[A-Z]*)?(?: (?:and|to) \d+[A-Z]*)?|Title|Motion)(?:, as amended,?)? ((?:dis)?agreed to|negatived)\.</p>',
        qpara)
    if clauseAgreedMatch:
        # anything other than a plain "agreed to" (including "disagreed to")
        # is reported as negatived
        if clauseAgreedMatch.group(1) == "agreed to":
            return "agreedto"
        return "negatived"
    clauseResolvedMatch = re.match(
        r'<p>Resolved in the (negative|affirmative),? and (?:Motion(?: \w+)?|amendments?|the manuscript amendment|Clause \d+|Amendment .{5,60}?)(?:, as amended,)? (?:dis)?agreed to accordingly(?:\.?</p>|;)',
        qpara)
    if clauseResolvedMatch:
        if clauseResolvedMatch.group(1) == "negative":
            return "disagreedto"
        return "agreedto"
    if re.match(r'<p>Remaining( clauses?| and| schedules?)+ agreed to\.</p>',
                qpara):
        return "agreedto"
    commonsAmendMatch = re.match(
        r'(?i)<p>(?:On Question, )?(?:manuscript )?(?:Commons )?Amendments? .{0,60}?(dis)?agreed to(?: accordingly)?\.</p>',
        qpara)
    if commonsAmendMatch:
        return "disagreedto" if commonsAmendMatch.group(1) else "agreedto"
    if re.match(r'<p>On Question, (?:Clause|Motion) .{0,16}?agreed to\.</p>',
                qpara):
        return "agreedto"
    if re.match(r'<p>Amendment disagreed to accordingly\.</p>', qpara):
        return "disagreedto"
    if re.match(r'<p>On Motion, Question agreed to\.</p>', qpara):
        return "agreedto"
    if re.match(r'<p>(The )?Schedule agreed to\.</p>', qpara):
        return "agreedto"

    if re.match(r'<p>Moved, That the .{0,120}? be (agreed to|approved)\.',
                qpara):
        return "considered"
    if re.match(r'<p>On Question, Whether .{0,60}? be agreed to\.', qpara):
        return "considered"
    if re.match(
            r'<p>The Commons amendments were considered and agreed to\.</p>',
            qpara):
        # was the misspelt "agreeto"; every other agreed-to result in this
        # function is spelt "agreedto"
        return "agreedto"

    if re.match(
            r'<p>(?:The )?Bill (?:was )?returned (?:earlier )?(?:from|to) the Commons.{0,350}?\.</p>',
            qpara):
        return "bill"
    if re.match(
            r'<p[^>]*>The Commons (?:(?:do not )?insist on .{0,160}? but propose|have made the following consequential|(?:dis)?agree (?:to|with)) .{0,260}?(?:\.|&mdash;)*</p>',
            qpara):
        return "bill"
    if re.match(
            r'<p[^>]*>The Lords insist on .{0,160}? for the following reasons?(?:\.|&mdash;)+</p>',
            qpara):
        return "bill"

    if re.match(
            r'(?i)<p[^>]*>(?:<i>)?House adjourned (?:at|during) .{0,60}?(?:</i>)?</p>',
            qpara):
        return "adjourned"
    if re.match(
            r'<p>(?:House|Debate|Second [Rr]eading debate|(?:Further )?[Cc]onsideration of amendments on Report) resumed(?: on Clause \d+)?[\.:]',
            qpara):
        return "resumed"

    if re.match("<p>A message was brought from the Commons", qpara):
        return "message"

    if re.match(r'<p>\*?Their Lordships divided:', qpara):
        return "divided"

    # this is the tag that can be used to give better titles on the motion text.
    if re.match(
            r'<p>(?:Clause|Schedule) (?:\d+[A-Z]* )?\[(?:<i>)?.*?(?:</i>)?\]:</p>',
            qpara):
        return "considered"
    if re.match('<p>On Question, Whether ', qpara):
        return "considered"

    if re.match('<p>(?:Brought|Returned)(?: earlier)? from the Commons',
                qpara):
        return "misc"
    if re.match('<p>House (?:again )?in Committee', qpara):
        return "misc"
    if re.match(r'(?i)<p>\[\s*The (?:deputy )?chairman ', qpara):
        return "misc"
    if re.match('<p>(?:Bill )?[Rr]ead a third time', qpara):
        return "bill"
    if re.match(r'<p>An amendment \(privilege\) made\.', qpara):
        return "misc"
    if re.match(r'<p>Report received\.', qpara):
        return "misc"

    if re.match('<p>:TITLE3:', qpara):
        return "title"  # perhaps remove this keyword

    # XXX MPS 2007-07-05: a check for an unexpected "The noble ... said:"
    # paragraph used to sit here; it was disabled ("Don't care about this").

    if re.match(r'<p>.{0,55}agreed to(?:\.| accordingly)', qpara):
        print("**********Marginal agreedto %s" % qpara)
        raise ContextException("Marginal agreed to",
                               stamp=qb.sstampurl,
                               fragment=qpara)

    return None
Exemple #26
0
def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Build the heading block for ``headingtxt`` and decide its type.

    This is an attempt at major heading detection.  The main wrap code spots
    adjournment debates, and does its best with some procedural things, but
    it's pretty flawed.  Also, the Oral questions heading is a super-major
    heading, so doesn't fit into the scheme.

    ``state`` is a dict carried between calls; the keys
    'just_had_points_of_order' and 'remaining_private_bills' are read and
    written here.

    Returns:
        A ``qspeech`` with ``nospeaker="true"``, whose ``typ`` is one of
        'minor-heading', 'inserted-heading', 'oral-heading' or
        'major-heading' and whose ``stext`` is the single cleaned heading.
        When ``typ`` is 'westminhall' the result is always 'minor-heading'.

    Raises:
        ContextException: for an imprecise "oral questions" or "in the
            chair" heading, or when the cleaned heading still holds '<' or
            '>'.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("(?i)</?(?:i|sup)>", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable

    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("(?i)Oral Answers to Questions", headingtxt):
        boralheading = True
    # Check if there are any other spellings of "Oral Answers to Questions" with a loose match
    elif re.search('(?i)oral', headingtxt) and re.search('(?i)ques', headingtxt) \
            and (not re.search(" Not ", headingtxt)) \
            and (not re.search("electoral", headingtxt)) \
            and stampurl.sdate not in ("2002-06-11", "2012-02-09"):  # have a genuine title with Oral in it
        print(headingtxt)
        raise ContextException('Oral question match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # All upper case headings - UGH
    elif not re.search('[a-z]', headingtxt) \
            and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt) \
            and not ('remaining_private_bills' in state
                     and re.search('(?i) Bill$', headingtxt)):
        bmajorheading = True

    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # Wah
    if stampurl.sdate > '2006-05-07':
        if re.match(
                "(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)",
                headingtxt):
            bmajorheading = True
        if re.match("(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    # an explicit test rather than an assert, so that the check still runs
    # when Python is started with -O (there is a matching assertion in
    # gidmatching)
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text',
                               stamp=stampurl,
                               fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb
Exemple #27
0
def FilterReply(qs):
    """Turn the text of a reply into a list of tokenized paragraphs.

    ``qs.text`` is split into paragraphs with ``SplitParaIndents``.  A
    leading "holding answer" phrase and a leading "asked to reply" phrase
    are each peeled off into paragraphs of their own; tables go through
    ``ParseTable``; "letter in library" and "letter from" introductions get
    their own paragraph types; everything else is tokenized with the
    paragraph code selected by its indent flag.

    Returns:
        The list of paragraphs produced by ``PhraseTokenize.GetPara`` and
        ``ParseTable``.

    Raises:
        Exception: if the text splits into no paragraphs.
        ContextException: if a paragraph starts a table that it does not
            also end.
    """
    # split into paragraphs.  The second results is a parallel array of bools
    (textp, textpindent) = SplitParaIndents(qs.text, qs.sstampurl)
    if not textp:
        raise Exception(' no paragraphs in result ')

    # the resulting list of paragraphs
    stext = []

    # index into the textp array as we consume it.
    i = 0

    # deal with holding answer phrase at front
    # <i>[holding answer 17 September 2003]:</i>
    qholdinganswer = resqbrack.match(textp[0])
    if qholdinganswer:
        pht = PhraseTokenize(qs, qholdinganswer.group(1))
        stext.append(pht.GetPara('holdinganswer'))
        textp[i] = textp[i][qholdinganswer.span(0)[1]:]
        if not textp[i]:
            i += 1

    # asked to reply
    # (guarded: the holding answer phrase may have used up the only paragraph)
    if i < len(textp):
        qaskedtoreply = reaskedtoreply.match(textp[i])
        if qaskedtoreply:
            pht = PhraseTokenize(qs, qaskedtoreply.group(0))
            stext.append(pht.GetPara('askedtoreply'))
            textp[i] = textp[i][qaskedtoreply.span(0)[1]:]
            if not textp[i]:
                i += 1

    # go through the rest of the paragraphs
    while i < len(textp):
        # deal with tables
        if re.match('(?i)<table', textp[i]):
            if not re.match(r'(?i)<table[^>]*>[\s\S]*?</table>$', textp[i]):
                print("textp[i]:  %s" % textp[i])
                raise ContextException("table start with no end",
                                       stamp=qs.sstampurl, fragment=textp[i])
            stext.extend(ParseTable(textp[i], qs.sstampurl))
            i += 1
            continue

        qletterinlibrary = reletterinlibrary.match(textp[i])
        if qletterinlibrary:
            pht = PhraseTokenize(qs, qletterinlibrary.group(0))
            stext.append(pht.GetPara('letterinlibrary'))
            # whatever follows the phrase is looked at again on the next pass
            textp[i] = textp[i][qletterinlibrary.span(0)[1]:]
            if not textp[i]:
                i += 1
            continue

        # <i>Letter from Ruth Kelly to Mr. Frank Field dated 2 December 2003:</i>
        # introducing a previous letter from a civil servant to an MP
        # this should tokenize the pieces more
        qlettfrom = relettfrom.match(textp[i])
        if qlettfrom:
            pht = PhraseTokenize(qs, qlettfrom.group(1))
            stext.append(pht.GetPara('letterfrom'))
            i += 1
            continue

        # nothing special about this paragraph (except it may be indented)
        pht = PhraseTokenize(qs, textp[i])
        stext.append(pht.GetPara(pcode[textpindent[i]], bKillqnum=True))
        i += 1

    return stext
Exemple #28
0
def FilterLordsSpeech(qb):
    """Split a Lords speech block into spoken and unspoken (motion) parts.

    ``qb`` is first passed through ``FilterDebateSpeech``.  The leading
    paragraphs are then examined to find where the spoken text starts
    (questions asked, amendments moved, oaths and introductions are not
    speech), and the paragraphs are scanned with ``MatchPWmotionStuff`` for
    the first procedural statement.  Everything from that statement onwards
    is split off into a ``nospeaker="true"`` block and each of its ``<p``
    paragraphs is tagged with a ``pwmotiontext`` attribute.

    Returns:
        A list of one to three ``qspeech`` blocks, in document order.
        ``qb`` itself is modified and is normally one of them.

    Raises:
        ContextException: when the opening of the speech cannot be
            classified, "The noble Lord said" is not where it is expected,
            or the speech start runs past the end of the paragraphs.
        AssertionError: when one of the structural invariants fails.
    """
    # pull in the normal filtering that gets done on debate speeches
    # does the paragraph indents and tables.  Maybe should be inlined for lords
    FilterDebateSpeech(qb)

    # the colon attr is blank or has a : depending on what was there after the name that was matched
    ispeechstartp1 = 0  # plus 1

    # no colonattr or colon, must be making a speech
    recol = re.search('colon="(:?)"', qb.speaker)
    bSpeakerExists = not re.match('nospeaker="true"', qb.speaker)
    if bSpeakerExists and (not recol or recol.group(1)):
        # text of this kind at the beginning should not be spoken, assume there wasn't a colon
        if not re.search("(?i)<p>(?:moved|asked|rose to move,) ",
                         qb.stext[0]) or re.search("(?i)<p>moved formally",
                                                   qb.stext[0]):
            ispeechstartp1 = 1  # 0th paragraph is speech text

    res = []  # output list
    preparagraphtype = ""
    if bSpeakerExists and (ispeechstartp1 == 0):
        if re.match(
                "<p>asked Her Majesty's Government|<p>asked the|<p>&mdash;Took the Oath",
                qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 == len(
                    qb.stext):  # No Noble Lord said, the usual
                ispeechstartp1 = 1
            if ispeechstartp1 != 1:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)
            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)",
                      qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 not in [1, 2]:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        # identify a writ of summons (single line)
        elif re.match(
                r"<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$",
                qb.stext[0]):
            assert len(qb.stext) == 1
            qb.stext[0] = re.sub(
                '^<p>', '<p pwmotiontext="summons">', qb.stext[0]
            )  # cludgy; already have the <p>-tag embedded in the string
            res.append(qb)
            return res  # bail out

        elif re.search(
                "having been created.*?Was, in (his|her) robes, introduced",
                qb.stext[0]):
            assert len(qb.stext) == 1
            qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
            qbunspo.typ = 'speech'
            qbunspo.stext = qb.stext
            qbunspo.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">',
                                      qbunspo.stext[0])
            res.append(qbunspo)
            return res

        # (a separate "<p>&mdash;Took the Oath" branch that asserted False
        # used to follow here; it could never run because the first branch
        # above already matches that text)

        # identify a moved amendment
        elif re.match(
                "<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention",
                qb.stext[0]):

            # find where the speech begins, and strip out "The noble lord said:"
            preparagraphtype = "moved"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)

            # everything up to this point is non-speech
            assert ispeechstartp1 > 0
            qbprev = qspeech(qb.speaker, "", qb.sstampurl)
            qbprev.typ = 'speech'
            qbprev.stext = qb.stext[:ispeechstartp1]

            res.append(qbprev)
            if ispeechstartp1 == len(qb.stext):
                return res

            # upgrade the spoken part
            qb.speaker = qb.speaker.replace('colon=""', 'colon=":"')
            del qb.stext[:ispeechstartp1]
            assert qb.stext
            ispeechstartp1 = 1  # the spoken text must reach at least here (after the line, "The noble lord said:")

        # error, no moved amendment found
        else:
            print(qb.stext)
            print("no moved amendment; is a colon missing after the name?")
            raise ContextException("missing moved amendment",
                                   stamp=qb.sstampurl)

    # advance to place where non-speeches happen
    if ispeechstartp1 > len(qb.stext):
        print("ispeechstartp1 problem; speeches running through %s %s" %
              (ispeechstartp1, len(qb.stext)))
        print(qb.stext)
        raise ContextException(
            "end of speech boundary unclear running through; need to separate paragraphs?",
            stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    sAmendmentStatement = None
    while bSpeakerExists and (ispeechstartp1 < len(qb.stext)):
        sAmendmentStatement = MatchPWmotionStuff(qb, ispeechstartp1)
        if sAmendmentStatement:
            break

        ispeechstartp1 += 1

    # there are no further lines after the withdrawal
    if ispeechstartp1 == len(qb.stext):
        assert not sAmendmentStatement
        res.append(qb)
        return res

    # do the further lines after withdrawal
    assert (not bSpeakerExists) or sAmendmentStatement

    # splice off the unspoken text running off from the amendment statements
    if ispeechstartp1 != 0:
        qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
        qbunspo.typ = 'speech'
        qbunspo.stext = qb.stext[ispeechstartp1:]
        del qb.stext[ispeechstartp1:]
        res.append(qb)
        res.append(qbunspo)
    else:
        res.append(qb)
        qbunspo = qb

    # check that once we begin pwmotion amendment statements, all statements are of this type
    for i, para in enumerate(qbunspo.stext):
        if not re.match('<p', para):
            continue
        sAmendmentStatement = MatchKnownAsPWmotionStuff(qbunspo, i)
        if not sAmendmentStatement:
            if IsNotQuiet():
                print("UNRECOGNIZED-MOTION-TEXT%s: %s" % (
                    " " if bSpeakerExists else "(*)", para))
            sAmendmentStatement = "unrecognized"
        qbunspo.stext[i] = re.sub(
            '^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % sAmendmentStatement,
            para)

    return res
Exemple #29
0
def LordsFilterDivision(text, stampurl, sdate):

	# the intention is to splice out the known parts of the division
	fs = re.split('\s*(?:<br>|</?p>)\s*(?i)', text)

	contentlords = [ ]
	notcontentlords = [ ]
	contstate = ''

	for fss in fs:
		if not fss:
			continue
		cfs = recontma.match(fss)
		if cfs:
			if cfs.group(1) == "CONTENTS":
				assert contstate == ''
				contstate = 'content'
			elif cfs.group(1) == 'NOT-CONTENTS' or cfs.group(1) == 'NOT CONTENTS':
				assert contstate == 'content'
				contstate = 'not-content'
			else:
				print "$$$%s$$$" % cfs.group(1)
				raise ContextException("unrecognised content state", stamp=stampurl, fragment=fss)

		elif re.match("(?:\[\*|\*\[)[Ss]ee col\. \d+\]", fss):
			print "Disregarding cross-reference in Division", fss
		elif re.match("\[\*\s*The Tellers.*?[Tt]he Clerks.*?\]", fss):
			print "Disregarding clerk comment on numbers", fss
		elif re.match("\[\*\s*The name of a .*? removed from the voting lists\.\]", fss):
			print "Disregarding removed from list comment", fss

		else:
			if not contstate:
				raise ContextException("empty contstate", stamp=stampurl, fragment=fss)

			# split off teller case
			teller = retellma.match(fss)
			tels = ''
			lfss = fss
			if teller:
				lfss = teller.group(1)
				tels = ' teller="yes"'

			# strip out the office
			offm = reoffma.match(lfss)
			if offm:
				lfss = offm.group(1)
			if not lfss:
				raise ContextException("no name on line", stamp=stampurl, fragment=fss)
			lordid = lordsList.MatchRevName(lfss, sdate, stampurl)
			lordw = '\t<lord person_id="%s" vote="%s"%s>%s</lord>' % (lordid, contstate, tels, FixHTMLEntities(fss))

			if contstate == 'content':
				contentlords.append(lordw)
			else:
				notcontentlords.append(lordw)

	# now build up the return value
	stext = [ ]
	stext.append('<divisioncount content="%d" not-content="%d"/>' % (len(contentlords), len(notcontentlords)))
	stext.append('<lordlist vote="content">')
	stext.extend(contentlords)
	stext.append('</lordlist>')
	stext.append('<lordlist vote="not-content">')
	stext.extend(notcontentlords)
	stext.append('</lordlist>')

	return stext
Exemple #30
0
def MatchKnownAsPWmotionStuff(qb, ispeechstartp1):
    res = MatchPWmotionStuff(qb, ispeechstartp1)
    if res:
        return res
    qpara = qb.stext[ispeechstartp1]
    #if re.match("<p>My Lords", qpara):
    #	raise ContextException("My Lords in known amendment text", stamp=qb.sstampurl, fragment=qpara)

    if re.match("<p>.{0,60}? Act[\.,]?</p>", qpara):
        return "act"
    if re.match("<p[^>]*>\([d\w]+\) ", qpara):
        return "lines"
    if re.match("<p[^>]*>\( \) ", qpara):
        return "lines"
    if re.match("<p><phrase class=\"date\".*</phrase>\.</p>", qpara):
        return "date"

    if re.match("<p[^>]*>Sections? .{0,30}?</p>", qpara):
        return "lines"
    if re.match(
            "<p[^>]*>(?:Schedule \S+?|The Schedule)(?:, paragraph.{0,60}?)?</p>",
            qpara):
        return "lines"

    if re.match("<p[^>]*>\d+[A-Z]?\.? ", qpara):
        return "lines"
    if re.match("<p[^>]*>Page \d+, line \d+, ", qpara):
        return "lines"
    if re.match("<p[^>]*>&quot;", qpara):
        return "quot"
    if re.match(
            "<p>[a-z]", qpara
    ):  # starting with lower case letter, some kind of continuation
        return "quot"
    if re.match("<p[^>]*>&mdash;", qpara):
        return "lines"

    # insert an extra space because they tend to ram it together
    clpmatch = re.match(
        "(<p[^>]*>\d+[A-Z]?)((?:Clause|Line|Page|Schedule|Because|After|Insert) .*$)",
        qpara)
    if clpmatch:
        qb.stext[ispeechstartp1] = "%s %s" % (clpmatch.group(1),
                                              clpmatch.group(2))
        return "lines"

    if re.match(
            "<p>The noble .{0,30}?(?:Lord|Baroness|Earl|Viscount|Countess|Duke) said",
            qpara):
        print "*****", qpara
        raise ContextException("unexpected weak Noble Lord Said",
                               stamp=qb.sstampurl,
                               fragment=qpara)
    if re.match("<p>The .{5,40}? \([^)]+\):", qpara):
        raise ContextException("unexpected person with position Said",
                               stamp=qb.sstampurl,
                               fragment=qpara)
    if re.match(
            "<p>(?:Lord|Baroness|Earl|Viscount|Countess|Duke) [\w\s].{5,40}?:",
            qpara):
        raise ContextException("unexpected person Said, (missing <b>?)",
                               stamp=qb.sstampurl,
                               fragment=qpara)
    if re.match(
            "<p>(?:Lord|Baroness|Earl|Viscount|Countess|Duke) [\w\s].{5,40}? moved ",
            qpara):
        raise ContextException("unexpected person moved, (missing <b>?)",
                               stamp=qb.sstampurl,
                               fragment=qpara)

    return None