Python StampUrl Examples, splitheadingsspeakers.StampUrl Python Examples

Example #1

0

Show file

File: sections.py Project: emmaclarke/parlparse

def StripWestminhallHeadings(headspeak, sdate):
    """Consume the standard opening headings of a Westminster Hall file.

    Args:
        headspeak: sequence of heading records.  For each record, ``[0]`` is
            matched as the heading text and ``[1]`` is scanned for stamps;
            ``[2]`` must be falsy on the date heading (presumably the
            speeches attached to the heading -- confirm against the caller).
        sdate: expected sitting date; compared with the ``.date`` attribute
            of the ``mx.DateTime`` value parsed from the date heading.

    Returns:
        Tuple ``(ih, stampurl)``: the index of the first heading left in
        place and the ``StampUrl`` built from the consumed headings.

    Raises:
        Exception: the date heading disagrees with ``sdate`` or has content
            in ``[2]``, or no stamp / page url was picked up.
    """
    # check and strip the first two headings in as much as they are there
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line: blank out italic tags, then drop any leading aname stamps
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s' % (repr(headspeak[ih]), sdate))
    ih = ih + 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)

Example #2

0

Show file

def StripWestminhallHeadings(headspeak, sdate):
    """Skip the leading Westminster Hall headings and gather the first stamps.

    Args:
        headspeak: sequence of heading records; ``[0]`` of a record is read
            as the heading text, ``[1]`` is handed to
            ``StampUrl.UpdateStampUrl`` and ``[2]`` has to be falsy on the
            date heading (presumably the speeches below it -- TODO confirm).
        sdate: sitting date the date heading must agree with.

    Returns:
        ``(ih, stampurl)`` -- index of the first heading not stripped, and
        the ``StampUrl`` updated from every stripped heading.

    Raises:
        Exception: on a date mismatch, on a date heading with content in
            ``[2]``, or when no stamp / page url was found.
    """
    # check and strip the first two headings in as much as they are there
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line
    dateheading = headspeak[ih]
    givendate = re.sub('</?i>', ' ', dateheading[0])
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or dateheading[2]:
        raise Exception('date heading %s mismatches with date %s' %
                        (repr(dateheading), sdate))
    ih += 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if not stampurl.stamp or not stampurl.pageurl:
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)

Example #3

0

Show file

File: speakers.py Project: samknight/parlparse

def FilterWMSSpeakers(fout, text, sdate):
    """Write ``text`` to ``fout`` with speaker fragments turned into tags.

    ``text`` is split with ``recomb`` and every fragment is first passed to
    ``StampUrl.UpdateStampUrl`` so that errors carry the current position.
    A fragment matching ``respeakervals`` is resolved through
    ``memberList.matchwmsname`` and written as a ``<speaker ...>`` element
    (skipped entirely when the speaker text is empty); any other fragment is
    copied through unchanged.

    Args:
        fout: object with a ``write`` method receiving the filtered output.
        text: the text to filter.
        sdate: date passed to ``StampUrl`` and to the name lookup.

    Raises:
        ContextException: the name lookup raised, a fragment still matches
            ``recomb`` without having been recognised, or ``remarginal``
            finds a near-miss in an unrecognised fragment.
    """
    stampurl = StampUrl(sdate)

    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            anamestamp = speakerg.group(1) or speakerg.group(2) or ""
            spstr = speakerg.group(3).strip()
            spstrbrack = speakerg.group(4)
            if not spstr:
                continue
            try:
                result = memberList.matchwmsname(spstr, spstrbrack, sdate)
            except Exception as e:
                # re-raise with the position so the failing fragment can be located
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in thisplace
            fout.write('%s<speaker %s>%s</speaker>\n' % (anamestamp, result.encode("latin-1"), spstr))
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        fout.write(fss)

Example #4

0

Show file

File: sections.py Project: samknight/parlparse

def StripWransHeadings(headspeak, sdate):
    # check and strip the first two headings in as much as they are there
    i = 0
    if (headspeak[i][0] != 'Initial') or headspeak[i][2]:
        print headspeak[0]
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # import pdb;pdb.set_trace()
    if (not re.match(
            '(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?(?i)',
            headspeak[i][0])) or headspeak[i][2]:
        if not re.match('The following answers were received.*',
                        headspeak[i][0]):
            pass
# print headspeak[i]
    else:
        i += 1

    givendate = string.replace(headspeak[i][0], "&nbsp;", " ")
    givendate = re.sub("</?i>", "", givendate)

    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)
    if (not re.match('(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*', headspeak[i][0]) and
           not re.match('(?:<stamp[^>]*>)?The following question was answered on.*', headspeak[i][0]) and \
     (sdate != mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[i][2]:
        if (not parlPhrases.wransmajorheadings.has_key(
                headspeak[i][0])) or headspeak[i][2]:
            print headspeak[i]
            raise ContextException('non-conforming second heading',
                                   stamp=None,
                                   fragment=headspeak[i][0])
    else:
        i += 1

    # find the url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for j in range(0, i):
        stampurl.UpdateStampUrl(headspeak[j][0])
        stampurl.UpdateStampUrl(headspeak[j][1])

# Later editions seem to miss first column number, sigh
    if not stampurl.stamp:
        for speeches in headspeak:
            text = ''.join([speech[1] for speech in speeches[2]])
            m = re.search('colnum="(\d+)W"', text)
            if m:
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>' %
                                        (sdate, int(m.group(1)) - 1))
                break

    if not stampurl.stamp or not stampurl.pageurl or not stampurl.aname:
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)

Example #5

0

Show file

def StripLordsDebateHeadings(headspeak, sdate):
    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading('Initial', ih, headspeak)

    # House of Lords
    ih = StripDebateHeading('house of lords(?i)', ih, headspeak, True)

    # Thursday, 18th December 2003.
    mdateheading = re.match('(?:<stamp aname="[^"]*"/>)*([\w\s\d,]*)\.?',
                            headspeak[ih][0])
    #time = TimeProcessing(timeg.group(1), previoustime, False, stampurl)
    #fout.write('<stamp time="%s"/>' % time)
    if not mdateheading or (sdate != mx.DateTime.DateTimeFrom(
            mdateheading.group(1)).date) or headspeak[ih][2]:
        print headspeak[ih]
        #raise ContextException('non-conforming date heading')  # recoverable?
    else:
        ih = ih + 1

    if re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:THE )?(?i)QUEEN(?:\'|&....;)S SPEECH',
            headspeak[ih][0]):
        print headspeak[ih][0]
        print "QUEENS SPEECH"
        # don't advance, because this is the heading (works for 2005-05-17)

    elif re.match("Parliament", headspeak[ih][0]):
        print "parliamentparliament"
        # don't advance; this is a title (works for 2005-05-11)

    else:
        #<H4><center>Reassembling after the Christmas Recess, the House met at half-past two of the clock: The LORD CHANCELLOR on the Woolsack.</center></H4>
        # The House met at eleven of the clock (Prayers having been read earlier at the Judicial Sitting by the Lord Bishop of St Albans): The CHAIRMAN OF COMMITTEES on the Woolsack.
        ih = StripDebateHeading(
            '(?:reassembling.*?recess, )?the house (?:met|resumed)(?: for Judicial Business)? at ([^(]*)(?i)',
            ih, headspeak, True)
        #print starttime. (we should use the "Half past two" business in house met to set it, unfortunately the filtercoltime has already happened

        # Prayers&#151;Read by the Lord Bishop of Southwell.
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    #stampurl.timestamp = '<stamp( time="%s")/>', starttime)

    # set the time from the wording 'house met at' thing.
    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception, ' missing stamp url at beginning of file '
    return (ih, stampurl)

Example #6

0

Show file

File: filterwmscolnum.py Project: JonathanBowker/parlparse

def FilterWMSColnum(fout, text, sdate):
    """Rewrite column and anchor markers of ``text`` as ``<stamp>`` tags.

    ``text`` is split with ``recomb``.  Column markers (``recolumnumvals``)
    are checked against ``sdate`` and the previous column, then written as a
    ``colnum="...WS"`` stamp preceded by a space.  Continuation markers
    (``recolnumcontvals``) are validated and dropped.  Anchor markers
    (``reanamevals``) are written as aname stamps.  Anything else is copied
    through unchanged.

    Args:
        fout: object with a ``write`` method receiving the filtered output.
        text: the text to filter.
        sdate: date every column marker has to carry.

    Raises:
        ContextException: a marker has the wrong date, a column number goes
            backwards, a continuation marker names a different column, or a
            fragment matches ``recomb`` without being recognised.
    """
    stamp = StampUrl(sdate)  # for error messages

    colnum = -1  # -1 until the first column marker has been seen
    for fss in recomb.split(text):
        columng = recolumnumvals.match(fss)
        if columng:
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = int(columng.group(2))

            # forward jumps are accepted; only going backwards is an error
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)
            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sWS"/>' % (sdate, lcolnum)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        columncontg = recolnumcontvals.match(fss)
        if columncontg:
            ldate = mx.DateTime.DateTimeFrom(columncontg.group(1)).date
            if sdate != ldate:
                raise ContextException("Cont column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)
            lcolnum = int(columncontg.group(2))
            if colnum != lcolnum:
                raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp)

            # a valid continuation marker produces no output
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            stamp.aname = '<stamp aname="%s"/>' % anameg.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stamp)
        fout.write(fss)

Example #7

0

Show file

File: speakers.py Project: samknight/parlparse

def LordsFilterSpeakers(fout, text, sdate):
	stampurl = StampUrl(sdate)

	officematches = {}

	# setup for scanning through the file.
	for fss in respeaker.split(text):

		# strip off the bolds tags
		# get rid of non-bold stuff
		bffs = respeakerb.match(fss)
		if not bffs:
			fout.write(fss)
			stampurl.UpdateStampUrl(fss)
			continue

		stampurl.UpdateStampUrl(fss)

		# grab a trailing colon if there is one
		fssb = bffs.group(1)
		if bffs.group(2):
			fssb = fssb + ":"

                # Remove the cruft
                fssb = re.sub('<stamp aname="[^"]*"/>', '', fssb)
                fssb = re.sub('</b><b>', '', fssb)

		# empty bold phrase
		if not re.search('\S', fssb):
			continue

		# division/contents/amendment which means this is not a speaker
		if renonspek.search(fssb):
			fout.write(fss)
			continue

		# part of quotes as an inserted title in an amendment
		if re.match('("|\[|&quot;)', fssb):
			fout.write(fss)
			continue

		# another title type (all caps), or a clause number
		if not re.search('[a-z]', fssb):
			fout.write(fss)
			continue

		# start piecing apart the name by office and leadout type
		namec = respeakervals.match(fssb)
		if not namec:
			print '*', fssb, '*'
			raise ContextException("bad format", stamp=stampurl, fragment=fssb)

		if namec.group('bracket'):
			name = re.sub('\s+', ' ', namec.group('bracket'))
			loffice = re.sub('\s+', ' ', namec.group('name'))
		else:
			name = re.sub('\s+', ' ', namec.group('name'))
			loffice = None

		colon = namec.group('colon')
		if not colon:
			colon = ""

		# get rid of some standard ones
		if re.match('the lord chancellor|noble lords|a noble lord|a noble baroness|the speaker(?i)', name):
			fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>' % ('unknown', name, name))
			continue


		# map through any office information
		if loffice:
			if (not re.match("The (Deputy |Minister of State)", loffice)) and (loffice in officematches):
                                if sdate!='2014-09-26' and sdate!='2012-09-24' and officematches[loffice] != name:
                                        raise ContextException("office inconsistency, loffice: %s name: %s officematches: %s" % (loffice, name, officematches[loffice]), stamp=stampurl, fragment=fssb)
			else:
				officematches[loffice] = name
		elif name in officematches:
			loffice = name
			name = officematches[loffice]

		if regenericspeak.match(name):
			fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>' % ('unknown', name, name))
			continue

		lsid = lordsList.GetLordIDfname(name, loffice=loffice, sdate=sdate, stampurl=stampurl)  # maybe throw the exception on the outside

                if not lsid:
                        fout.write('<speaker person_id="unknown" error="No match" speakername="%s" colon="%s">%s</speaker>' % (name, colon, name))
                else:
                        fout.write('<speaker person_id="%s" speakername="%s" colon="%s">%s</speaker>' % (lsid, name, colon, name))

                if namec.group('maiden'):
                        fout.write('<i>%s</i>' % namec.group('maiden'))

Example #8

0

Show file

File: filterdebatecoltime.py Project: scotm/parlparse

def FilterDebateColTime(fout, text, sdate, typ):
	# old style fixing (before patches existed)
	if typ == "debate":
		text = ApplyFixSubstitutions(text, sdate, fixsubs)

	stamp = StampUrl(sdate) # for error messages
	btodaytype = re.match('<pagex [^>]*type="today"', text)
	if btodaytype:
		fout.write('<stamp colnum="000"/>\n')

	colnum = -1
	previoustime = []
	for fss in recomb.split(text):
		# column number type
		columng = recolumnumvals.match(fss)
		if columng:
			assert not btodaytype  # no columns in today

			# check date
			ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
			if sdate != ldate:
				raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)

			# check number
			lcolnum = string.atoi(columng.group(2))
			if lcolnum == colnum - 1:
				pass	# spurious decrementing of column number stamps
			elif (colnum == -1) or (lcolnum == colnum + 1):
				pass  # good
			# column numbers do get skipped during division listings
			elif lcolnum < colnum:
				raise ContextException("Colnum not incrementing %d smaller than %d -- %s" % (lcolnum, colnum, fss), stamp=stamp, fragment=fss)

			# write a column number stamp (has to increase no matter what)
			if lcolnum > colnum:
				colnum = lcolnum
				stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
			fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
			continue

		columncg = recolnumcontvals.match(fss)
		if columncg:
			ldate = mx.DateTime.DateTimeFrom(columncg.group(1)).date
			if sdate != ldate:
				raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)

			lcolnum = string.atoi(columncg.group(2))
			if colnum != lcolnum and sdate<'2006-05-08':
				raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss), stamp=stamp, fragment=fss)

			continue

		timeg = retimevals.match(fss)
		if timeg:
			time = TimeProcessing(timeg.group(1), previoustime, (timeg.group(0)[0] == '['), stamp)
			if not time:
				raise ContextException("Time not matched: " + timeg.group(1), stamp=stamp, fragment=fss)

			fout.write('<stamp time="%s"/>' % time)
			previoustime.append(time)
			continue

		# anchor names from HTML <a name="xxx">
		anameg = reanamevals.match(fss)
		if anameg:
                        aname = anameg.group(1)
                        stamp.aname = '<stamp aname="%s"/>' % aname
                        fout.write('<stamp aname="%s"/>' % aname)
                        continue


		# nothing detected
		# check if we've missed anything obvious
		if recomb.match(fss):
			print "$$$", fss, "$$$"
			print regcolnumcont
			print re.match(regcolnumcont + "(?i)", fss)
			raise ContextException('regexpvals not general enough', stamp=stamp, fragment=fss)
		if remarginal.search(fss):
			print fss
			print '--------------------------------\n'
			print "marginal found: ", remarginal.search(fss).groups()
			print "zeroth: ", remarginal.search(fss).group(0)
			print '--------------------------------\n'
			raise ContextException('marginal coltime/a detection case', stamp=stamp, fragment=fss)
		fout.write(fss)

Example #9

0

Show file

File: sections.py Project: emmaclarke/parlparse

def StripDebateHeadings(headspeak, sdate):
    """Skip the opening headings of a Commons debate file and gather stamps.

    Args:
        headspeak: sequence of heading records; ``[0]`` is read as the
            heading text, ``[1]`` is scanned for stamps and ``[2]`` must be
            falsy on the date and "house met at" headings (presumably the
            attached speeches -- TODO confirm).
        sdate: sitting date; compared with the date heading and against a
            few hard-coded dates that need special handling.

    Returns:
        ``(ih, stampurl)``: index of the first heading not consumed and the
        ``StampUrl`` updated from the consumed headings.  When a "house met
        at" heading was read, ``stampurl.timestamp`` holds the start time.

    Raises:
        Exception: the date heading disagrees with ``sdate`` or has content
            in ``[2]``, or no stamp / page url was found.
        ContextException: the "house met at" heading is missing or
            malformed, or its wording is not one of the known start times.
    """
    # check and strip the first two headings in as much as they are there
    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        # these are all tried in order with the extra True argument
        for pattern in (
                'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE',
                'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND',
                r'\[WHICH OPENED .*?\]',
                '.*? YEAR OF THE REIGN OF.*?',
                'HER MAJESTY QUEEN ELIZABETH II',
                r'SI.*? SERIES.*?VOLUME \d+',
                'SI.*? SERIES',
                r'VOLUME \d+'):
            ih = StripDebateHeading(pattern, ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    #House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' % (repr(headspeak[ih]), sdate))
        ih = ih + 1

    gstarttime = None
    if sdate != "2001-06-13":
        #The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException('non-conforming "the house met at" heading %s' % repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        #PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih, headspeak, True)

        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        wording = re.sub('</?i>', ' ', gstarttime.group(1))
        wording = re.sub(r'\s+', ' ', wording)
        # known wordings, tried in this order, matched case-insensitively
        # at the start of the wording
        knowntimes = (
            ("half-past Nine", '09:30:00'),
            ("a quarter to Ten o", '09:45:00'),
            ("Ten o'clock", '10:00:00'),
            ("half-past Ten", '10:30:00'),
            ("Eleven o&#039;clock", '11:00:00'),
            (r"twenty-five minutes past\s*Eleven", '11:25:00'),
            (r"twenty-six minutes past\s*Eleven", '11:26:00'),
            (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
            ("half-past Eleven", '11:30:00'),
            ("Twelve noon", '12:00:00'),
            ("half-past One", '13:30:00'),
            ("half-past Two", '14:30:00'),
            ("twenty minutes to Three", '14:40:00'),
            ("10 minutes past Three", '15:10:00'),
            ("Six o'clock", '18:00:00'),
        )
        for pattern, newtime in knowntimes:
            if re.match(pattern, wording, re.I):
                stampurl.timestamp = '<stamp time="%s"/>' % newtime
                break
        else:
            raise ContextException("Start time not known: " + wording)

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)

Example #10

0

Show file

File: colnum.py Project: samknight/parlparse

def FilterWransColnum(fout, text, sdate):
    """Rewrite column and anchor markers of written answers as ``<stamp>`` tags.

    After the ``fixsubs`` substitutions and removal of empty
    ``{**con**}`` pairs, ``text`` is split with ``recomb``.  Column markers
    are validated and written as ``colnum="...W"`` stamps preceded by a
    space; continuation markers are validated and replaced by a single
    space (or, for the group-6 form, by a new column stamp); anchor markers
    become aname stamps; everything else is copied through unchanged.

    Args:
        fout: object with a ``write`` method receiving the filtered output.
        text: the text to filter.
        sdate: date every dated marker has to carry.

    Raises:
        ContextException: wrong date on a marker, column number going
            backwards, disagreeing continuation column, or an unrecognised
            fragment that still matches ``recomb``.
    """
    # Legacy individual substitution rules
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Remove junk
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages

    colnum = -1  # -1 until the first column marker has been seen
    for fss in recomb.split(text):
        columng = recolumnumvals.match(fss)
        if columng:
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       fragment=fss,
                                       stamp=stamp)

            lcolnum = int(columng.group(2))
            # column numbers do get skipped during division listings, so
            # only going backwards is an error
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" %
                                       (lcolnum, fss),
                                       fragment=fss,
                                       stamp=stamp)

            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate,
                                                                  lcolnum)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        columncontg = recolnumcontvals.match(fss)
        if columncontg:
            # the dated form is captured by groups (1, 2) or (3, 4)
            ldate = columncontg.group(1) or columncontg.group(3) or None
            lcolnum = columncontg.group(2) or columncontg.group(4) or None
            if ldate:
                ldate = mx.DateTime.DateTimeFrom(ldate).date
                if sdate != ldate:
                    raise ContextException(
                        "Cont column date disagrees %s -- %s" % (sdate, fss),
                        fragment=fss,
                        stamp=stamp)
                lcolnum = int(lcolnum)
                if colnum != lcolnum and sdate < '2006-05-08':
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" %
                        (colnum, fss),
                        fragment=fss,
                        stamp=stamp)

                # no need to output anything
                fout.write(' ')
                continue
            if columncontg.group(5):
                # accepted when it names the current or the previous column
                lcolnum = int(columncontg.group(5))
                if colnum != lcolnum and colnum != lcolnum + 1:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" %
                        (colnum, fss),
                        fragment=fss,
                        stamp=stamp)
                fout.write(' ')
                continue
            if columncontg.group(6):
                # must name the next column, which then becomes current
                lcolnum = int(columncontg.group(6))
                if colnum + 1 != lcolnum:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" %
                        (colnum, fss),
                        fragment=fss,
                        stamp=stamp)
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate,
                                                                      lcolnum)
                fout.write(' ')
                fout.write(stamp.stamp)
                continue
            # a match with none of these groups set falls through to the
            # checks below

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            stamp.aname = '<stamp aname="%s"/>' % anameg.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough',
                                   fragment=fss,
                                   stamp=stamp)

        fout.write(fss)

Example #11

0

Show file

File: coltime.py Project: samknight/parlparse

def FilterDebateColTime(fout, text, sdate, typ):
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    stamp = StampUrl(sdate)  # for error messages
    btodaytype = re.match('<pagex [^>]*type="today"', text)
    if btodaytype:
        fout.write('<stamp colnum="000"/>\n')

    colnum = -1
    previoustime = []
    for fss in recomb.split(text):
        # column number type
        columng = recolumnumvals.match(fss)
        if columng:
            assert not btodaytype  # no columns in today

            # check date
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       stamp=stamp,
                                       fragment=fss)

            # check number
            lcolnum = string.atoi(columng.group(2))
            if lcolnum == colnum - 1:
                pass  # spurious decrementing of column number stamps
            elif (colnum == -1) or (lcolnum == colnum + 1):
                pass  # good
            # column numbers do get skipped during division listings
            elif lcolnum < colnum:
                raise ContextException(
                    "Colnum not incrementing %d smaller than %d -- %s" %
                    (lcolnum, colnum, fss),
                    stamp=stamp,
                    fragment=fss)

            # write a column number stamp (has to increase no matter what)
            if lcolnum > colnum:
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate,
                                                                      lcolnum)
            fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
            continue

        columncg = recolnumcontvals.match(fss)
        if columncg:
            ldate = mx.DateTime.DateTimeFrom(columncg.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       stamp=stamp,
                                       fragment=fss)

            lcolnum = string.atoi(columncg.group(2))
            if colnum != lcolnum and sdate < '2006-05-08':
                raise ContextException(
                    "Cont column number disagrees %d -- %s" % (colnum, fss),
                    stamp=stamp,
                    fragment=fss)

            continue

        timeg = retimevals.match(fss)
        if timeg:
            time = TimeProcessing(timeg.group(1), previoustime,
                                  (timeg.group(0)[0] == '['), stamp)
            if not time:
                raise ContextException("Time not matched: " + timeg.group(1),
                                       stamp=stamp,
                                       fragment=fss)

            fout.write('<stamp time="%s"/>' % time)
            previoustime.append(time)
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            aname = anameg.group(1)
            stamp.aname = '<stamp aname="%s"/>' % aname
            fout.write('<stamp aname="%s"/>' % aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            print "$$$", fss, "$$$"
            print regcolnumcont
            print re.match(regcolnumcont + "(?i)", fss)
            raise ContextException('regexpvals not general enough',
                                   stamp=stamp,
                                   fragment=fss)
        if remarginal.search(fss):
            print fss
            print '--------------------------------\n'
            print "marginal found: ", remarginal.search(fss).groups()
            print "zeroth: ", remarginal.search(fss).group(0)
            print '--------------------------------\n'
            raise ContextException('marginal coltime/a detection case',
                                   stamp=stamp,
                                   fragment=fss)
        fout.write(fss)

Example #12

0

Show file

File: colnum.py Project: henare/parlparse

def FilterWransColnum(fout, text, sdate):
    """Rewrite written-answer column markers in *text* as stamp tags on *fout*.

    The text is first run through ApplyFixSubstitutions and stripped of empty
    "{**con**}{**/con**}" markers, then split with ``recomb``.  Each fragment
    is treated as one of:

    * a full column heading: its date must equal *sdate*; a
      ``<stamp coldate=... colnum="...W"/>`` tag is written after a space;
    * a column continuation: cross-checked against the current column and
      replaced by a single space (one variant also advances the column and
      writes a fresh stamp);
    * an anchor name: written as ``<stamp aname="..."/>``;
    * anything else: copied through unchanged.

    Raises ContextException when a date or column number is inconsistent, or
    when a fragment matches ``recomb`` but none of the specific patterns.
    """
    # Legacy individual substitution rules, then remove junk markers.
    cleaned = ApplyFixSubstitutions(text, sdate, fixsubs)
    cleaned = cleaned.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # handed to ContextException for error messages
    colnum = -1  # -1 until the first column heading has been seen

    def write_column_stamp(number):
        # Remember the stamp for later error reports, then emit it.
        stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, number)
        fout.write(" ")
        fout.write(stamp.stamp)

    for fss in recomb.split(cleaned):
        heading = recolumnumvals.match(fss)
        if heading:
            heading_date = mx.DateTime.DateTimeFrom(heading.group(1)).date
            if sdate != heading_date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            heading_col = int(heading.group(2))
            # Forward jumps are accepted (column numbers do get skipped
            # during division listings); only going backwards is an error.
            if colnum != -1 and heading_col < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (heading_col, fss), fragment=fss, stamp=stamp)

            colnum = heading_col
            write_column_stamp(heading_col)
            continue

        cont = recolnumcontvals.match(fss)
        if cont:
            cont_date = cont.group(1) or cont.group(3) or None
            cont_col = cont.group(2) or cont.group(4) or None

            # Variant carrying a date: groups 1/2 or 3/4.
            if cont_date:
                cont_date = mx.DateTime.DateTimeFrom(cont_date).date
                if sdate != cont_date:
                    raise ContextException(
                        "Cont column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp
                    )
                cont_col = int(cont_col)
                # The number check is only enforced for dates before 2006-05-08.
                if colnum != cont_col and sdate < "2006-05-08":
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                # no stamp to output, only a separating space
                fout.write(" ")
                continue

            # Variant in group 5: must name the current or the previous column.
            if cont.group(5):
                cont_col = int(cont.group(5))
                if colnum not in (cont_col, cont_col + 1):
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                fout.write(" ")
                continue

            # Variant in group 6: must name the next column, which becomes current.
            if cont.group(6):
                cont_col = int(cont.group(6))
                if colnum + 1 != cont_col:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                colnum = cont_col
                write_column_stamp(cont_col)
                continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # Nothing detected: the splitter recognised this fragment but no
        # specific pattern did, so the patterns need widening.
        if recomb.match(fss):
            raise ContextException("regexpvals not general enough", fragment=fss, stamp=stamp)

        fout.write(fss)

Example #13

0

Show file

def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written answers with <speaker> elements.

    The text is passed through ApplyFixSubstitutions, paragraphs whose leading
    name lacks bold markup are repaired (only when memberList knows the name),
    and the text is then split into tables, bold runs and the text between
    them.  Every non-blank bold run outside a table is resolved with
    memberList.matchfullnamecons and replaced in place by
    ``<speaker person_id="..." speakername="...">name</speaker>``.  The
    resulting fragments are written to *fout* with writelines.

    Raises ContextException("No name match") when a bold name cannot be
    matched and is not covered by one of the special cases below.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        '(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
        text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        # p2 (an empty <b></b> and/or whitespace) is deliberately dropped
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        namematches = memberList.fullnametoids(p3, sdate)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if namematches:
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out
    tableregexp = ltableregexp + '(?i)'

    lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Index-based loop on purpose: handling fs[i] may also rewrite its
    # neighbours fs[i - 1] and fs[i + 1].
    for i in range(len(fs)):
        fss = fs[i]
        fss = stampurl.UpdateStampUrl(fss)  # Speakers have new stamps in them

        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search('\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                '^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        if robj:
            # TODO: do something with robj.group(1) here (it is the "failed
            # oral questions" signifier)
            boldnamestring = robj.group(2)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
                boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename,
             remadecons) = memberList.matchfullnamecons(name,
                                                        cons,
                                                        sdate,
                                                        alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not person_id:
                if remadename == "MultipleMatch":
                    # Hard-wired disambiguation for one known ambiguous name.
                    # If the first candidate is not the expected person the
                    # name is handled like any other ambiguous one; previously
                    # that case left person_id unset and failed further down.
                    if (boldnamestring == 'Mr. Michael Foster' and
                            remadecons[0] == 'uk.org.publicwhip/person/10209'):
                        person_id = remadecons[0]
                        remadename = ' speakername="Michael Foster"'
                        remadecons = 'Worcester'
                    else:
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match",
                                           stamp=stampurl,
                                           fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \
          (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)

Example #14

0

Show file

def StripDebateHeadings(headspeak, sdate):
    """Consume the standard headings that open a Commons debate day.

    *headspeak* is indexed as headspeak[i][0] (heading text), headspeak[i][1]
    (text scanned for stamps) and headspeak[i][2] (must be empty for the date
    and "house met at" headings; presumably the speeches under the heading --
    confirm against the caller).  Headings are checked in order with
    StripDebateHeading: volume/report boilerplate, "House of Commons", the
    date line (compared with *sdate*), the "The House met at ..." line, and
    the prayers / chair lines.

    Returns (ih, stampurl): the index of the first heading not consumed, and
    a StampUrl updated from the consumed headings, with its timestamp set
    from the "house met at" wording when that heading was read.

    Raises Exception on a date mismatch or a missing stamp/url, and
    ContextException on a malformed or unrecognised "house met at" heading.
    """
    # Spelled-out sitting times that have been seen, tried in this order.
    start_times = (
        ("half-past Nine(?i)", '09:30:00'),
        ("a quarter to Ten o(?i)", '09:45:00'),
        ("Ten o'clock(?i)", '10:00:00'),
        ("half-past Ten(?i)", '10:30:00'),
        ("Eleven o&#039;clock(?i)", '11:00:00'),
        ("twenty-five minutes past\s*Eleven(?i)", '11:25:00'),
        ("twenty-six minutes past\s*Eleven(?i)", '11:26:00'),
        ("twenty-nine minutes past\s*Eleven(?i)", '11:29:00'),
        ("half-past Eleven(?i)", '11:30:00'),
        ("Twelve noon(?i)", '12:00:00'),
        ("half-past One(?i)", '13:30:00'),
        ("half-past Two(?i)", '14:30:00'),
        ("twenty minutes to Three(?i)", '14:40:00'),
        ("10 minutes past Three(?i)", '15:10:00'),
        ("Six o'clock(?i)", '18:00:00'),
    )

    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading(
        'Initial', ih, headspeak
    )  # the 'Initial' is inserted by the splitheadingsspeakers function

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih,
                                headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
            True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih,
            headspeak, True)
        ih = StripDebateHeading('\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading('SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading('VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # (skipped when the "house met at" line comes straight away)
    if not re.match('the house met at .*(?i)', headspeak[ih][0]):
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        # drop any leading anchor stamps before parsing the date
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', givendate)
        if gd:
            givendate = gd.group(1)
        if ((sdate !=
             mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' % (
                repr(headspeak[ih]), sdate))
        ih = ih + 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$(?i)',
            headspeak[ih][0])
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading('pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading('\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        time = gstarttime.group(1)
        time = re.sub('</?i>', ' ', time)
        time = re.sub('\s+', ' ', time)
        for pattern, newtime in start_times:
            if re.match(pattern, time):
                break
        else:
            raise ContextException("Start time not known: " + time)
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)

Example #15

0

Show file

File: speakers.py Project: samknight/parlparse

def FilterDebateSpeakers(fout, text, sdate, typ):
    """Replace speaker markup in debate text with <speaker> elements on *fout*.

    *typ* selects per-house handling: for "westminhall" the name in the
    "[... in the Chair]" phrase is substituted for bare "Deputy Speaker"
    names; for "debate" the legacy ApplyFixSubstitutions rules are applied
    first.  Missing bold tags around known names are repaired and
    "(Urgent Question)" is moved out of the bold name.  The text is then split
    with ``recomb``; each fragment matching ``respeakervals`` is resolved
    through memberList.matchdebatename and written as a <speaker> element,
    and every other fragment is copied through.

    Raises ContextException when the chair phrase is missing (westminhall),
    when name matching fails, or when a fragment looks like a speaker but is
    not matched by the specific pattern.
    """
    if typ == "westminhall":
        depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")

        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        # p2 (an empty <b></b> and/or whitespace) is deliberately dropped
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        namematches = memberList.fullnametoids(p3, sdate)
        if namematches:
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name
    urgentqns = re.findall('(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)(?i)', text)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # bold runs that are numbered items, CHAPTER/PART titles or all-caps
        # text are passed through rather than treated as speakers
        if re.match('<b>(&#8220;)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            # we can use oqnum to detect oral questions
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub("\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search("deputy[ \-]speaker(?i)", spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        if remarginal.search(fss):
            raise ContextException(' marginal speaker detection case: %s' % remarginal.search(fss).group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)

Example #16

0

Show file

def FilterLordsColtime(fout, text, sdate):
    """Rewrite Lords column and time markers in *text* as stamp tags on *fout*.

    The text is split with ``recomb`` and each fragment is handled as one of:

    * a column heading: its date must equal *sdate*; a
      ``<stamp coldate=... colnum=.../>`` tag is written when the number is
      the first seen or 1 to 5 ahead of the current column, and nothing is
      written otherwise;
    * a time: normalised with TimeProcessing and written as
      ``<stamp time=.../>``;
    * a heading containing a spelled-out time: the heading is copied through
      and followed by a time stamp derived from its wording;
    * an anchor name: written as ``<stamp aname=.../>``;
    * anything else: copied through unchanged.

    Raises ContextException on a column date mismatch or on a fragment that
    looks like a marker but is not matched by the specific patterns, and
    AssertionError when a heading time is not one of the known forms.
    """
    colnum = -1  # -1 until the first column heading has been accepted

    stampurl = StampUrl(sdate)
    previoustime = []

    # Hour word found in a heading -> clock template.  Kept as an explicit
    # list of seen examples because of the transition between am and pm.
    heading_hours = {
        "two": "2:%s pm",
        "three": "3:%s pm",
        "six": "6:%s pm",
        "nine": "9:%s am",
        "eleven": "11:%s am",
        "ten": "10:%s am",
    }

    for fss in recomb.split(text):
        # column number type

        # we need some very elaborate checking to sort out the sections, by
        # titles that are sometimes on the wrong side of the first column,
        # and by colnums that miss the GC code in that section.
        # column numbers are also missed during divisions, and this exception
        # should be detected and noted.

        # That implies that this is the filter which detects the boundaries
        # between the standard four sections.
        columng = recolumnumvals.match(fss)
        if columng:
            # check date
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stampurl, fragment=fss)

            # check number
            lcolnum = int(columng.group(3))
            # A repeated or decremented number is a spurious stamp and is
            # ignored; a jump of up to 5 is accepted (we get skipped columns
            # in divisions; was 2 but this caused us to miss ones).  Anything
            # else is silently dropped.
            if lcolnum not in (colnum - 1, colnum) and \
                    ((colnum == -1) or (colnum + 1 <= lcolnum <= colnum + 5)):
                colnum = lcolnum
                fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
            continue

        timeg = retimevals.match(fss)
        if timeg:
            time = timeg.group(1)
            # a captured closing </h5> or </st> tag is not a time; drop it
            if not re.match('(?:</h5>|</st>)(?i)', time):
                time = TimeProcessing(time, previoustime, False, stampurl)
                fout.write('<stamp time="%s"/>' % time)
                if time:
                    previoustime.append(time)
            continue

        # special lift a time out of the heading
        regtime3 = regtime3vals.match(fss)
        if regtime3:
            fout.write(fss)  # put this heading back into the flow of text
            assert not previoustime
            lntimematch = re.match("(half[\- ]past )?(\w+)(-thirty)?$", regtime3.group(1))
            lnhour = lntimematch and lntimematch.group(2)
            lntimep = heading_hours.get(lnhour)
            if lntimep is None:
                print("-------------'%s'" % regtime3.group(1))
                # explicit raise so the failure is not lost under python -O
                raise AssertionError("unrecognised heading time '%s'" % regtime3.group(1))
            # "half past" and "-thirty" must not both be present
            assert not lntimematch.group(1) or not lntimematch.group(3)
            halfhour = lntimematch.group(1) or lntimematch.group(3)
            ntime = lntimep % ("30" if halfhour else "00")
            time = TimeProcessing(ntime, previoustime, False, stampurl)
            fout.write('<stamp time="%s"/>' % time)
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            aname = anameg.group(1)
            fout.write('<stamp aname="%s"/>' % aname)
            # NOTE(review): stores the bare name, not the full stamp tag --
            # confirm this is what StampUrl expects.
            stampurl.aname = aname
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            print("$$$ %s $$-$" % (fss,))
            # a programming error between splitting and matching
            raise ContextException(' regexpvals not general enough ', stamp=stampurl, fragment=fss)
        if remarginal.search(fss):
            print(remarginal.search(fss).group(0))
            lregcolumnum6 = '<p>\s*</ul>\s*<a name="column_\d+">(?:</a>)?\s*<b>[^:<]*:\s*column\s*\d+\s*</b></p>\s*<ul><font size=3>(?i)'
            print(re.findall(lregcolumnum6, fss))
            raise ContextException(' marginal coltime detection case ', stamp=stampurl, fragment=fss)
        fout.write(fss)

Example #17

0

Show file

File: colnum.py Project: samknight/parlparse

def FilterWMSColnum(fout, text, sdate):
    """Rewrite column markers in *text* as "WS" stamp tags on *fout*.

    The text is split with ``recomb``.  A full column heading must carry the
    date *sdate* and must not go backwards; it is written as
    ``<stamp coldate=... colnum="...WS"/>`` after a space.  A continuation
    marker is checked against the current date and column and then dropped.
    An anchor name is written as ``<stamp aname="..."/>``.  Any other
    fragment is copied through unchanged.

    Raises ContextException when a date or column number is inconsistent, or
    when a fragment matches ``recomb`` but none of the specific patterns.
    """
    stamp = StampUrl(sdate)  # for error messages
    colnum = -1  # -1 until the first column heading has been seen

    for fss in recomb.split(text):
        heading = recolumnumvals.match(fss)
        if heading:
            if sdate != mx.DateTime.DateTimeFrom(heading.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       fragment=fss,
                                       stamp=stamp)

            number = int(heading.group(2))
            # Skipping ahead is tolerated; only a decrease is rejected.
            if colnum != -1 and number < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" %
                                       (number, fss),
                                       fragment=fss,
                                       stamp=stamp)

            colnum = number
            stamp.stamp = '<stamp coldate="%s" colnum="%sWS"/>' % (sdate,
                                                                   number)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        continuation = recolnumcontvals.match(fss)
        if continuation:
            if sdate != mx.DateTime.DateTimeFrom(continuation.group(1)).date:
                raise ContextException("Cont column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       fragment=fss,
                                       stamp=stamp)
            if colnum != int(continuation.group(2)):
                raise ContextException(
                    "Cont column number disagrees %d -- %s" % (colnum, fss),
                    fragment=fss,
                    stamp=stamp)
            # a consistent continuation marker produces no output
            continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # Nothing detected: the splitter recognised this fragment but no
        # specific pattern did, so the patterns need widening.
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough',
                                   fragment=fss,
                                   stamp=stamp)
        fout.write(fss)