def DetectDidnotparticipate(self, gnv, vlabsent): adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext) mnotparticipate = re.match( "(.*?)\s*did not participate in the voting.", adtext) if mnotparticipate: assert len(vlabsent) == 1 if vlabsent[0] == self.i += 1 else: print "nonparcitipation name wrong", vlabsent[ 0], #if self.undocname != "S-PV-4305": raise unexception("mismatch nonparticipationvoting", self.tlcall[self.i].paranum) else: msubvote = re.match( "\[Subsequently.*? (Jamaica) .*? voted? in (favour)", adtext) if msubvote: nat = gnv[nat] = "%s-%s" % (gnv[nat], self.i += 1 elif len(vlabsent) != 0: if self.undocname not in [ "S-PV-3412", "S-PV-3413", "S-PV-3407", "S-PV-3409" ]: # cases where Rwanda is absent raise unexception("unaccounted nonparticipationvoting", self.tlcall[self.i].paranum)
def DetectSubsequentVoteChange(self, gnv): adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext) msubseq = re.match("\[Subsequently,? (.*?)\.?\]", adtext) self.votechanges = {} if not msubseq: if"Subsequently", adtext): print adtext raise unexception("unexpected subsequently", self.tlcall[self.i].paranum) return for sadtext in re.split(";\s*", if not sadtext: continue msadtext = re.match( "the delegations? of (.*?) (?:informed|advised) the [Ss]ecretariat that (?:it|they) (?:had )?intended to (vote in favour|vote against|abstain)$", sadtext) if not msadtext: msadtext = re.match( "the delegations? of (.*?)(?:(?:informed|advised) the Secretariat that (?:it|they))? had (not) intended to participate(?: in the voting)?$", sadtext) if not msadtext: msadtext = re.match( "the delegations? of (.*?) had intended to (vote in favour|vote against|abstain)$", sadtext) if not msadtext: print "---%s---" % sadtext #print re.match("the delegations? of (.*?) (?:informed|advised) the Secretariat that (?:it|they) had", sadtext) raise unexception("change vote advice unrecognized", self.tlcall[self.i].paranum) mess, natlist, carryforward = self.DetectNationList(, "ANDLIST", self.tlcall[self.i].paranum) assert natlist #print sadtext, #print natlist, "(", assert not carryforward for nat in natlist: assert nat not in self.votechanges if"favour", vch = "favour" elif"against", vch = "against" elif"abstain", vch = "abstain" elif"not", vch = "absent" else: assert False self.votechanges[nat] = vch self.votechange = adtext self.i += 1 for nat in self.votechanges: gnv[nat] = "%s-%s" % (gnv[nat], self.votechanges[nat])
def ExtractDateTime(self, txline, ltext): # extract the date out if poss mdate = re.match( "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$", ltext) if not mdate: #Tuesday, 3 December 2002, 10 a.m. if"Friday", ltext) and IsNotQuiet(): print ltext, re.match( "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?m\.?| noon\.?)?(?: \(closed\))?", ltext) return #print txlines[ih].ltext iday = int( if not in months: if IsNotQuiet(): print, months raise unexception( "unrecognized month", paranumC(txline.undocname, None, 0, -1, txline.textcountnumber)) imonth = months.index( syear = if not re.match("(?:20\d\d|19\d\d)$", syear): raise unexception( "bad year", paranumC(txline.undocname, None, 0, -1, txline.textcountnumber)) ihour = int( imin = and int( or 0 if and == "a" and ihour == 12: ihour = 0 elif and == "p" and ihour != 12: ihour += 12 if raise unexception( "date redefined", paranumC(txline.undocname, None, 0, -1, txline.textcountnumber)) if not (0 <= ihour <= 23) or not (0 <= imin <= 59): if IsNotQuiet(): print ltext raise unexception( "bad time", paranumC(txline.undocname, None, 0, -1, txline.textcountnumber)) = "%s-%02d-%02d %02d:%02d" % (syear, imonth + 1, iday, ihour, imin)
def DetectNationList(self, ptext, fromlast, paranum): bforce = (not not fromlast) if fromlast and fromlast not in ["FIRST", "ANDLIST"]: print "carryingforward $%s$" % fromlast ptext = "%s %s" % (fromlast, ptext) if ptext == "None": return "presentcomplete", [], None ptext = re.sub("</?i>", "", ptext) votelist = [ c.strip() for c in re.split("[,\.]", ptext) if not re.match("\s*$", c) ] if fromlast == "ANDLIST": #print "vvvv", votelist assert votelist if not FixNationName(votelist[-1], self.sdate): mand ="(.*?) and (.*)$", votelist[-1]) if mand: votelist[-1] = votelist.append( if re.match("<i>", ptext): if bforce: print fromlast, bforce, ptext assert False return "nothingmore", -1, -1 if votelist and not FixNationName(votelist[-1], self.sdate) and ( ptext[-1] != ",") and fromlast != "ANDLIST": carryforward = votelist[-1] votelist = votelist[:-1] else: carryforward = None res = [] fres = [] for lnation in votelist: nation = FixNationName(lnation, self.sdate) if not nation and fromlast == "ANDLIST" and re.match( "[Tt]he ", lnation): nation = FixNationName(lnation[4:], self.sdate) if nation: if nation != "INVALID": res.append(nation) else: fres.append(lnation) if bforce and fres: print votelist print "****", fres print "cccccc", carryforward raise unexception("votelist problem", self.tlcall[self.i].paranum) if res and not fres: return "present", res, carryforward if bforce: assert not fres return "presentblank", res, "" # the "In favour" is followed by a new page #print fres return "nothingmore", -1, -1
def ExtractDotLineChair(self, txlines, ih): assert self.pageno == 1 #<text top="334" left="185" width="584" height="17" font="2">Mr. Kavan . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . (Czech Republic)</text> while True: #print "------" + txlines[ih].ltext mchair ="([^>:]*?)\s*\. \. \. \. \.", txlines[ih].ltext) if mchair: break # fix missing year date #if self.undocname == "A-55-PV.44" and txlines[ih].ltext == "Monday, 30 October, 10 a.m.": # txlines[ih].ltext = "Monday, 30 October 2000, 10 a.m." self.ExtractDateTime(txlines[ih], txlines[ih].ltext) ih += 1 if ih == len(txlines): return -1 if not if IsNotQuiet(): for i in range(ih): print "--%s--" % txlines[i].ltext raise unexception( "dotlinechair date problem", paranumC(txlines[ih].undocname, None, 0, -1, txlines[ih].textcountnumber)) assert False # when country name for the president . . . . is not on same line mcountry ="\((.*?)\)$", txlines[ih].ltext) if not mcountry: ih += 1 #print txlines[ih].ltext mcountry = re.match("\((.*?)\)$", txlines[ih].ltext) if not mcountry: if IsNotQuiet(): print txlines[ih].ltext raise unexception( "unable to extract country from ...-line", paranumC(txlines[ih].undocname, None, 0, -1, txlines[ih].textcountnumber)) ih += 1 chairname = re.sub("\s\s+", " ", self.chairs.append( (chairname, FixNationName(, return ih
def DetectAdoption(self): adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext) madtext = "(adopted|carried|retained.*?|rejected)(?:, as amended,|, as a whole,)?\s+by(?: votes)?\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?", adtext) if not madtext: madtext = re.match( "(By)\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?", adtext) if not madtext: print "--%s-- %d" % (adtext, self.i) raise unexception("by votes problem", self.tlcall[self.i].paranum) ifavour = int( iagainst = ( != "none" and int( or 0) #if == "rejected": # i = ifavour; ifavour = iagainst; iagainst = i iabstain = ( and int( or 0) if == "rejected": il = (iagainst, ifavour, iabstain) else: il = (ifavour, iagainst, iabstain) ivl = (len(self.vlfavour), len(self.vlagainst), len(self.vlabstain)) if il != ivl: if IsNotQuiet(): print "wrong-count", self.undocname, il, ivl # wrong values are found on A-57-PV.73 s(favour=154, 152) if self.undocname not in [ "A-56-PV.82", "A-57-PV.73", "A-58-PV.54", "A-52-PV.69", "A-50-PV.90", "A-49-PV.83", ]: raise unexception("wrong votecount", self.tlcall[self.i].paranum) self.motiontext = MarkupLinks(adtext, self.undocname, self.paranum) self.i += 1
def DetectAgendaForm(ptext, genasssess, prevagendanum, paranum): if re.match("Agenda(?: items?)? \d+(?i)", ptext): blinepara = "boldline-agenda" acptext = re.sub( "(?:<i>\s*\(|\(\s*<i>|\()\s*(?:continued|resumed)\s*(?:\)\s*</i>|</i>\s*\)|\))|<i>\s*</i>", " ", ptext).strip() acptext = re.sub("\(\w\)|;.*$", "", acptext) acptext = re.sub("agenda items?(?i)", " ", acptext) acptext = re.sub("and", ", ", acptext) if not re.match("[\d\s,]+$", acptext): print ptext raise unexception("malformed boldline agenda", paranum) res = ",".join( ["%s-%s" % (aa, genasssess) for aa in re.findall("\d+", acptext)]) assert res return res mprovag = re.match( "Items? (\d+)(?: and (\d+)?)?(?: \(\w\))? of the provisional agenda", ptext) if mprovag: res = "%sp-%s" % (, genasssess) if res = "%s,%sp-%s" % (res,, genasssess) return res mreqreopen = re.match("Request for the reopening.*?agenda item (\d+)", ptext) if mreqreopen: return "%s-%s" % (, genasssess) if re.match("\(\w\)", ptext): if not re.match("\d+-\d+", prevagendanum): print "can't copy from prevagendanum", prevagendanum return "" assert prevagendanum.split("-")[1] == genasssess #print "\n\n\ncontinuingagendanum", prevagendanum, ptext return prevagendanum for agt, reagt in AgendaTypeMap: if, ptext): if agt == "condolence": print "NNNN", ptext return "%s-%s" % (agt, genasssess) print "\n\n**** ", ptext print genasssess #assert not"Agenda", ptext), ptext return ""
def CleanupTags(ptext, typ, paranum): assert typ in [ "council-agenda", "italicline", "italicline-tookchair", "italicline-spokein", "boldline", "spoken" ] if typ == "boldline": ptext = re.sub("</?b>", "", ptext).strip() if"<b>", ptext): ptext = re.sub("<b>([.,]\s*)</b>", "\\1", ptext) # slipt in a cleaning substitution here (can't find a better place for now) #ptext = re.sub("[u'`\u017d']", "'", ptext) # this one doesn't work ptext = re.sub(u'[\xad]', "-", ptext) # some very invisibley different symbol # could have a special paragraph type for this mspokein = re.match( "\((spoke in \w+(.*?delegation|President's Office)?)\)$", ptext) if mspokein: stext = re.sub("<[/ib]*>", "", return "<i>%s</i>" % stext if"<[^/i]+>", ptext): print ptext raise unexception("tag other than italics in text", paranum) if re.match("<i>.*?</i>[\s\.\-]?$", ptext): print ptext raise unexception("total italics in text", paranum) if"</?i>", "".join(re.split("<i>(.*?)</i>", ptext))): print ptext raise unexception("unmatched italics in spoken text", paranum) if"\s\S\s\S\s\S\s", ptext): print ptext raise unexception("probable gaps in text", paranum) return ptext
def DetectVote(self, votere): tlc = self.tlcall[self.i] votem = re.match(votere, tlc.paratext) if not votem: # missing abstain column case bAftervote = re.match( "<i>\s*(?:The )?[Dd]raft|<b>\s*The President|<i>\s*Operative paragraph|<i>.*?did not participate", tlc.paratext) if bAftervote and"Abstain", votere): # and self.undocname in ["A-53-PV.81", "A-55-PV.103", "A-55-PV.83", "A-55-PV.86", "A-56-PV.105", "A-56-PV.68", "A-56-PV.82", "A-56-PV.86", "A-57-PV.57", "A-57-PV.66", "A-57-PV.77", "A-58-PV.55", "A-58-PV.72"]: return [] if bAftervote and"Against", votere) and self.bSecurityCouncil: return [] if"Against|Abstaining", votere) and "Subsequently", tlc.paratext) and self.bSecurityCouncil: return [] if self.undocname in ["A-55-PV.44"] and "Against", votere) and re.match("<i>Abstaining", tlc.paratext): return [] if"Against", votere) and re.match( "<i>Abstaining:?</i>", tlc.paratext): return [] print "failed with:", votere, tlc.paratext raise unexception("votelist detectvote match", tlc.paranum) #print tlc.paratext mess, natlist, carryforward = self.DetectNationList( tlc.paratext[votem.end(0):].strip(), "FIRST", tlc.paranum) assert mess != "nothingmore" self.i += 1 # deal with nation names merging across pages. while True: mess, cnatlist, carryforward = self.DetectNationList( self.tlcall[self.i].paratext, carryforward, self.tlcall[self.i].paranum) if mess == "nothingmore": #print self.tlcall[self.i].paratext break natlist.extend(cnatlist) self.i += 1 return natlist
def __init__(self, txline, lundocname, lpageno, textcountnumber): mxline = re.match( '<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)" font="(\d+)">(.*?)</text>', txline) if not mxline: print txline, "tttttt" = int( self.left = int( self.width = int( self.height = int( self.font = int( self.pageno = lpageno self.undocname = lundocname self.textcountnumber = textcountnumber self.ltext = self.ltext = re.sub("<i>\s*</i>|<b>\s*</b>", " ", self.ltext) if re.match("<[ib]>\s*</[ib]>|\s*$", self.ltext): self.ltext = "" # will be removed if not self.ltext: return self.bfootertype = (self.left < 459 and self.left + self.width > 459) or re.match( footertext, self.ltext) #if self.bfootertype: # print self.ltext # move on any short bits that are like 13^(th) if self.height == 11 and not self.bfootertype and self.width <= 10: #print self.left, self.width, "'%s'" % self.ltext assert self.width <= 10 if self.ltext not in ["th", "rd", "st", "nd"]: if IsNotQuiet(): print self.ltext raise unexception( "unrecognized shortbit", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) += 2 # push the step down from 16 to 18
def AppendToCluster(txlcol, txl): # frig the indentation on the most common mistakes if re.match( "<i>The meeting (?:was called|was suspended|rose at|was resumed)", txl.ltext) and (txl.indent == 0): txl.indent = 31 if not txlcol: txlcol.append(TextLineCluster(txl)) return txl.vgap = - txlcol[-1].txls[-1].top #print txlcol[-1].txls[-1].ltext #print txl.vgap, txl.width, txl.height,, txl.ltext # zzzz # frig vgaps in some cases where the spacing was wider than normal if txl.undocname in ["A-50-PV.84", "A-50-PV.88"]: if txl.vgap == 21 or txl.vgap == 22: txl.vgap = 18 if txl.vgap == 42: txl.vgap = 43 if txl.undocname == "S-PV-5584": if txl.vgap == 20: txl.vgap = 19 if not txl.vgap in familiarvgaps: if IsNotQuiet(): print "\n\n vgap=", txl.vgap, "\n\nwidth/height/top", txl.width, txl.height,, txl.ltext # zzzz print " familiar vgaps:", familiarvgaps raise unexception( "vgap not familiar", paranumC(txl.undocname, None, 0, -1, txl.textcountnumber)) if txl.vgap in (0, 17, 18, 19) or txl.vgap == 0: txlcol[-1].AddLine(txl) else: #print txl.vgap, "vvvv", txl.ltext txlcol.append(TextLineCluster(txl))
def __init__(self, tlcall, i, lundocname, lsdate, seccouncilmembers):
    """Parse a recorded-vote block that starts at paragraph ``tlcall[i]``.

    tlcall            -- list of paragraph clusters (each with ``paratext``,
                         ``paranum`` and ``txls``).
    i                 -- index of the first paragraph of the block.
    lundocname        -- document name; must look like "S-PV..." or
                         "A-<n>-PV...".
    lsdate            -- date passed on to the nation-name helpers.
    seccouncilmembers -- council membership; replaced by None for
                         documents that are not Security Council records.

    After construction ``self.i`` is the index of the first paragraph after
    the block, and ``self.votecount``, ``self.votelist``, ``self.motiontext``
    and ``self.typ`` ("vote") describe the result.

    Raises unexception when the announcement of a vote is not followed by
    the expected "vote was taken" line.
    """
    self.tlcall = tlcall
    self.i = i
    self.sdate = lsdate
    self.undocname = lundocname
    # these hold match objects (truthy) or None, and are used as booleans
    self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
    self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
    assert self.bGeneralAssembly or self.bSecurityCouncil
    if not self.bSecurityCouncil:
        seccouncilmembers = None
    self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum

    # skip the optional announcement and the "vote was taken" line
    vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
    if self.bGeneralAssembly and re.match(
            "A recorded vote has been requested(?: for this item| on (?:the|this) motion|\. We shall now begin the voting process)?\.?$",
            vtext):
        self.i += 1
        vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
    if self.bGeneralAssembly and re.match(
            "A recorded vote was taken\s*\.?$", vtext):
        self.i += 1
    if self.bSecurityCouncil and re.match(
            "A vote was taken(?: by (?:a )?show of hands)?.$", vtext):
        self.i += 1
    # nothing was consumed: only tolerated for the listed documents
    if not (self.i != i or
            self.undocname in ["A-55-PV.86", "A-50-PV.90", "A-49-PV.90"]):
        print "--%s--" % tlcall[self.i - 1].paratext
        if not re.match("<i>", tlcall[self.i - 1].paratext):
            print " --[should this line be italic?]"
        print tlcall[self.i].paratext
        raise unexception("requested vote not followed through",
                          tlcall[self.i].paranum)

    # the three vote lists, in the order they appear in the record
    self.vlfavour = self.DetectVote("<i>In favour:?\s*</i>:?")
    self.vlagainst = self.DetectVote("(?:<i>)?Against:?\s*(?:</i>)?:?")
    self.vlabstain = self.DetectVote("(?:<i>)?Abstaining:?(?:</i>)?:?")
    gnv, self.vlabsent = GenerateNationsVoteList(self.vlfavour,
                                                 self.vlagainst,
                                                 self.vlabstain, self.sdate,
                                                 self.paranum,
                                                 seccouncilmembers)
    self.votecount = "favour=%d against=%d abstain=%d absent=%d" % (len(
        self.vlfavour), len(self.vlagainst), len(
            self.vlabstain), len(self.vlabsent))
    if IsNotQuiet():
        print " ", self.votecount

    # the paragraphs after the lists differ between the two bodies
    if self.bGeneralAssembly:
        self.DetectAdoption()
        self.DetectSubsequentVoteChange(gnv)
    if self.bSecurityCouncil:
        self.motiontext = ""
        self.DetectDidnotparticipate(gnv, self.vlabsent)

    #res = [ '\t\t<div style="border:1px solid black; margin-left:2em"><b>VOTE ', votecount, "</b><br>\n", "\t\t<i>", self.motiontext, "</i>\n" ]
    #res.append('\t\t<div style="font-size:6">')
    # one <span class="vote">nation</span> per nation, sorted by nation
    lvotelist = []
    for nation, vote in sorted(gnv.items()):
        lvotelist.append('<span class="%s">%s</span>' % (vote, nation))
    self.votelist = ", ".join(lvotelist)
    #res.append("</div></div>\n")
    #self.parafout = "".join(res)
    self.typ = "vote"
def ExtractSeccounFrontPage(self, txlines): = None self.chairs = [] self.seccouncilmembers = [] self.agenda = [] lasttop = -1 jtxlines = [] ih = 0 while ih < len(txlines): if txlines[ih].top == lasttop: jtxlines[-1] = "%s %s" % (jtxlines[-1], txlines[ih].ltext) else: jtxlines.append(txlines[ih].ltext) lasttop = txlines[ih].top ih += 1 del txlines # just deletes the reference to this object ih = 0 while ih < len(jtxlines): self.ExtractDateTime(None, jtxlines[ih]) mpresseat = re.match( "<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$", jtxlines[ih]) #print jtxlines[ih], mpresseat if mpresseat: if not if IsNotQuiet(): for i in range(ih): print jtxlines[i] raise unexception( "missingg date", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) if in ["President", "Chairman"]: assert len(self.chairs) == 0 # first one else: assert len(self.chairs) == 1 # later president ih += 1 if scountry = else: scountry = "" if"\(", scountry) and not"\)", scountry): scountry = "%s %s" % (scountry, jtxlines[ih]) ih += 1 mcountry = re.match("\((.*?)\)$", scountry) lfscountry = re.sub("\s+", " ", fscountry = FixNationName(lfscountry, if not fscountry: if IsNotQuiet(): print "--%s--" % raise unexception( "unrecognized nationA", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) chairname = re.sub("\s\s+", " ", self.chairs.append((chairname, fscountry, "president")) if fscountry in self.seccouncilmembers: assert len(self.seccouncilmembers) == 1 assert fscountry == "New Zealand" assert self.undocname == "S-PV-3370" assert len(self.chairs) == 2 del self.chairs[0] del self.seccouncilmembers[0] self.seccouncilmembers.append(fscountry) continue mcountryseat = re.match( "(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$", jtxlines[ih]) if mcountryseat: if if len(self.chairs) not in [ 1, 2 ]: # in case of 
second president if IsNotQuiet(): print self.chairs, "chchchch" raise unexception( "chairs not thereB", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) else: if len(self.chairs) == 0: if not # prob a closed meeting break if IsNotQuiet(): print ih, jtxlines[ih] raise unexception( "seat without chair", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) lfscountry = re.sub("\s+", " ", fscountry = FixNationName(lfscountry, if not fscountry: if IsNotQuiet(): print "--%s--" % raise unexception( "unrecognized nationB", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) chairname = re.sub("\s\s+", " ", self.chairs.append((chairname, fscountry, "member")) if fscountry not in self.seccouncilmembers: self.seccouncilmembers.append(fscountry) else: if IsNotQuiet(): print "Repeat-country on council", fscountry else: if" \. \. \. \. \. \. ", jtxlines[ih]): if IsNotQuiet(): print "--%s--" % jtxlines[ih] raise unexception( "missing country", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) if re.match("<b>Agenda\s*</b>$", jtxlines[ih]): ih += 1 break if"Agenda", jtxlines[ih]): print ih, jtxlines raise unexception( "unextracted Agenda (should be <b>?)", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) ih += 1 # could be a closed meeting if not alltext = " ".join(jtxlines) if "OFFICIAL COMMUNIQU..*?Held in private (?:in the Security Council Chamber )?at Headquarters(?i)", alltext): return False return True while ih < len(jtxlines): if re.match("\d\d-\d\d", jtxlines[ih]): break if re.match("\d\d.?\d\d\d\d\d \(E\)", jtxlines[ih]): break if re.match( "This record contains the text of speeches delivered in English", jtxlines[ih]): break #print "agagag", jtxlines[ih] assert not"text of speeches|verbatim(?i)", jtxlines[ih]) self.agenda.append(jtxlines[ih].strip()) ih += 1 #print "ccccc", self.chairs lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber) if len(self.chairs) not in (15, 17) or 
len(self.seccouncilmembers) != 15: if self.undocname == "S-PV-3446": return False if IsNotQuiet(): print len(self.seccouncilmembers), len( self.chairs ), "wrong number of members or chairs\n", self.chairs print self.seccouncilmembers raise unexception("wrongnumber on council", lparanum) self.agenda = " ".join(self.agenda) self.agenda = re.sub("</?b>", " ", self.agenda) self.agenda = re.sub("\s\s+", " ", self.agenda) self.agenda = MarkupLinks( CleanupTags(self.agenda, "council-agenda", lparanum), self.undocname, lparanum) return True
def __init__(self, xpage, lundocname, lpageno, textcountnumber): self.pageno = lpageno self.undocname = lundocname self.textcountnumber = textcountnumber self.bSecurityCouncil = re.match("S-PV.(\d+)", self.undocname) self.nSecurityCouncilSession = self.bSecurityCouncil and int( or 0 self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname) assert self.bSecurityCouncil or self.bGeneralAssembly # for right column, if not left justified, this adds a bit more to the right if self.bGeneralAssembly and int( re.match("A-(\d+)", lundocname).group(1)) <= 52: rightcolstartindentincrement = 1 else: rightcolstartindentincrement = 0 # set the column starts from some of the special cases we get leftcolstart = 90 if self.bGeneralAssembly and int( re.match("A-(\d+)", lundocname).group(1)) <= 54: rightcolstart = 481 else: rightcolstart = 468 if lundocname in [ "A-54-PV.100", "A-54-PV.96", "A-54-PV.98", "A-54-PV.99", "S-PV-4143", "S-PV-4143-Resu.1" ]: rightcolstart = 468 elif lundocname in ["A-54-PV.97"]: rightcolstart = 486 elif re.match("S-PV-335[0-8]", lundocname): rightcolstart = 468 elif re.match("S-PV-334", lundocname): rightcolstart = 468 elif self.nSecurityCouncilSession >= 4144: rightcolstart = 468 #re.match("S-PV-414[4-9]", lundocname): # rightcolstart = 468 #elif re.match("S-PV-41[5-9]", lundocname): # rightcolstart = 468 #elif re.match("S-PV-4[2-9]", lundocname): # rightcolstart = 468 #elif re.match("S-PV-5", lundocname): # rightcolstart = 468 elif self.bSecurityCouncil: rightcolstart = 481 rightcolstartindentincrement = 1 # generate the list of lines, sorted by vertical position ftxlines = re.findall("<text.*?</text>", xpage) txlines = [] for txline in ftxlines: txl = TextLine(txline, lundocname, lpageno, self.textcountnumber) self.textcountnumber += 1 if txl.ltext: if txlines and txlines[-1].bfootertype and txlines[ -1].top == txl.bfootertype = True txlines.append(txl) txlines.sort(key=TextLineTopKey) # the half divider is at 459 # try to separate out the header and 
footers if self.pageno == 1 and self.bGeneralAssembly: ih = self.ExtractDotLineChairHead(txlines) #for Dtxl in txlines[-10:]: # print, Dtxl.left, Dtxl.ltext ie = len(txlines) - 1 while txlines[ie].bfootertype: #print "FOOTER:", txlines[ie].ltext ie -= 1 #print "**NON-FOOTER:", txlines[ie].ltext ie += 1 # the whole first page gets parsed separately assert not self.bSecurityCouncil elif self.bSecurityCouncil and self.pageno == 1: if not self.ExtractSeccounFrontPage(txlines): self.bSecurityCouncil = "ClosedSession" return # special case where the agenda spills to a second page (don't forget the outer application of this if) elif self.bSecurityCouncil and lundocname in twopageagendas and self.pageno == 2: ih = 0 self.agenda = [] while ih < len(txlines): if 132 <= txlines[ih].top < 1000: self.agenda.append(txlines[ih].ltext) ih += 1 self.agenda = " ".join(self.agenda) self.agenda = re.sub("</?b>", " ", self.agenda) self.agenda = re.sub("\s\s+", " ", self.agenda) lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber) self.agenda = MarkupLinks( CleanupTags(self.agenda, "council-agenda", lparanum), self.undocname, lparanum) return elif self.bGeneralAssembly: if re.match("<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[0].ltext): ih = 1 elif re.match("\d", txlines[0].ltext) and re.match( "<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[1].ltext): ih = 2 else: #print txlines[0].ltext assert re.match("General Assembly", txlines[0].ltext), txlines[0].ltext assert re.match("\d+(?:th|st|nd|rd) (?:plenary )?meeting", txlines[1].ltext) assert re.match("\S+ [Ss]ession", txlines[2].ltext) assert re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) or ( lundocname in ["A-50-PV.38", "A-50-PV.40"]) ih = 4 ie = len(txlines) - 1 if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext): ie -= 1 pagenumtext = re.sub("<..?>", "", txlines[ie].ltext).strip() if re.match("\d\d\-\d\d\d\d\d", txlines[ie - 1].ltext): ie -= 1 if not re.match("\d+$", pagenumtext): if IsNotQuiet(): print "jjjj", pagenumtext, 
txlines[ie].ltext raise unexception( "pagenum error not a number", paranumC(self.undocname, None, 0, -1, txlines[ie].textcountnumber)) if int(pagenumtext) != self.pageno: if IsNotQuiet(): print pagenumtext, self.pageno raise unexception( "pagenum serror of speaker-intro", paranumC(self.undocname, None, 0, -1, txlines[ie].textcountnumber)) elif self.bSecurityCouncil: #if len(txlines) < 4: # raise unexception("intro too short", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber)) bl0 = len(txlines) > 4 and re.match("Security Council", txlines[0].ltext) bl1 = len(txlines) > 4 and re.match( "\d+(?:th|st|nd|rd)? (?:\(Resumption(?: \d)?\) )?(?:meeting)?", txlines[1].ltext) bl2 = len(txlines) > 4 and re.match("(\w+-\w+|\w+) [Yy]ear", txlines[2].ltext) bl3 = len(txlines) > 4 and re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) bl4 = re.match( "<b>S/PV.\d+\s*(?:\(Resumption [\d|I]\)|\(Part [I]+\))?\s*</b>", txlines[0].ltext) bl4r = (self.undocname[5:] >= "4143") if bl4 and bl4r: ih = 1 elif bl0 and bl1 and bl2 and bl3: ih = 4 else: if IsNotQuiet(): print "\nFirst four lines on page:", self.pageno, bl4, bl4r print bl0, txlines[0].ltext print bl1, txlines[1].ltext print bl2, txlines[2].ltext print bl3, txlines[3].ltext print bl4, bl4r raise unexception( "bad page header", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber)) ie = len(txlines) - 1 if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext): ie -= 1 pagenumtext = txlines[ie].ltext mpagenumtext = re.match("(?:<b>)?(\d+)\s*(?:</b>)?$", pagenumtext) if not mpagenumtext: if IsNotQuiet(): print "jkjk", pagenumtext raise unexception( "pagenum error not a number", paranumC(self.undocname, None, 0, -1, txlines[ie].textcountnumber)) pgoffset = int( - self.pageno if pgoffset != 0 and self.undocname not in misnumberedpages: if IsNotQuiet(): print "pagenum-offset not in list", self.undocname, 1), self.pageno raise unexception( "page pagenum error of speaker-intro", paranumC(self.undocname, None, 0, -1, 
txlines[ie].textcountnumber)) if re.match("\d\d-\d\d\d\d\d$", txlines[ie - 1].ltext): ie -= 1 else: assert False # separate out the header and footers self.txlheader = txlines[:ih] self.txlfooter = txlines[ie:] # separate the body into the two columns self.txlcol1 = [] self.txlcol2 = [] self.minindentleft = 9999 self.minindentright = 9999 for txl in txlines[ih:ie]: if txl.left < 459: #print txl.bfootertype, txl.left, txl.width,, txl.ltext # zzzz # there's a bit of spilling out where the region is larger than it should be for the words as in A-56-PV.64 if not (txl.left + txl.width <= 459): if txl.left + txl.width > 501: if IsNotQuiet(): print txl.left, txl.width, txl.left + txl.width print txl.ltext print "might have page no. 1 on first page (or add to twopageagendas)" raise unexception( "right-hand extension excessive", paranumC(txl.undocname, None, 0, -1, txl.textcountnumber)) if not (txl.left <= 165): bc = -1 while True: assert self.txlcol1[-1].txls[ bc].top == # in-line but shorter if (self.txlcol1[-1].txls[bc].left <= 165): break bc -= 1 txl.indent = txl.left - leftcolstart if txl.indent < 0: if IsNotQuiet(): print txl.indent, txl.ltext raise unexception( "negative indentation", paranumC(txl.undocname, None, 0, -1, txl.textcountnumber)) self.minindentleft = min(txl.indent, self.minindentleft) txl.brightcol = False AppendToCluster(self.txlcol1, txl) else: txl.indent = txl.left - rightcolstart if txl.indent != 0: txl.indent += rightcolstartindentincrement if txl.indent < 0: if IsNotQuiet(): print txl.indent, txl.left, rightcolstart print txl.ltext raise unexception( "negative indent on righthand column", paranumC(self.undocname, None, 0, -1, self.textcountnumber)) self.minindentright = min(txl.indent, self.minindentright) txl.brightcol = True AppendToCluster(self.txlcol2, txl)
def __init__(self, xfil, undocname): self.sdate = None self.chairs = None self.agenda = None self.tlcall = None self.seccouncilmembers = None self.bSecurityCouncil = re.match("S-PV.\d+", undocname) self.bGeneralAssembly = re.match("A-\d+-PV", undocname) xpages = StripPageTags(xfil, undocname) if not xpages: return # bitmap type encountered txpages = [] self.tlcall = [] for i in range(len(xpages)): txpage = TextPage(xpages[i], undocname, i + 1, (txpages or 0) and txpages[-1].textcountnumber) if i == 0 and txpage.bSecurityCouncil == "ClosedSession": if IsNotQuiet(): print " -- closedsession" self.tlcall = None return # closed session encountered txpages.append(txpage) if txpage.bSecurityCouncil and i == 0: continue # special cases of agenda overflowing into two pages if txpage.bSecurityCouncil and i == 1 and undocname in twopageagendas: txpages[0].agenda = "%s %s" % ( txpages[0].agenda, txpage.agenda ) # ram it all into one paragraph (who cares) continue bmissingcolumns = undocname in ["A-61-PV.106", "A-52-PV.39"] if txpage.txlcol1: AppendCluster(self.tlcall, txpage.txlcol1[0], "newpage") for tlc in txpage.txlcol1[1:]: AppendCluster(self.tlcall, tlc, "gapcluster") elif not bmissingcolumns: #assert i == len(xpages) - 1 # only last page can have missing columns (sometimes it's the first) print "page", i, "of", len(xpages) #print txpages[-1].textcountnumber raise unexception( "missing column not on last page", paranumC(undocname, None, 0, -1, txpages[-1].textcountnumber)) # have had a case where the first column was the blank one if txpage.txlcol2: AppendCluster(self.tlcall, txpage.txlcol2[0], "newcolumn") for tlc in txpage.txlcol2[1:]: AppendCluster(self.tlcall, tlc, "gapcluster") elif not bmissingcolumns: assert i == len(xpages) - 1, "%d != %d" % (i, len(xpages) - 1) # assign ids to the clusters self.sdate = txpages[0].date paranumlast = paranumC(undocname, self.sdate, 0, -1, 0) for tlc in self.tlcall: if tlc.txls[0].pageno == paranumlast.pageno: paranumlast = 
paranumC(undocname, self.sdate, paranumlast.pageno, paranumlast.paragraphno + 1, tlc.txls[0].textcountnumber) else: paranumlast = paranumC(undocname, self.sdate, tlc.txls[0].pageno, 1, tlc.txls[0].textcountnumber) tlc.paranum = paranumlast # merge the lines together and remove double bold/italics that happen across lines for tlc in self.tlcall: jparatext = [] # don't insert spaces where there is a hyphen for txl in tlc.txls: if jparatext and not ("\w[-/]$", jparatext[-1]) and re.match("\w", txl.ltext)): jparatext.append(" ") jparatext.append(txl.ltext) tlc.paratext = "".join(jparatext) tlc.paratext = re.sub("-</i> <i>", "-", tlc.paratext) tlc.paratext = re.sub("-</b> <b>", "-", tlc.paratext) tlc.paratext = re.sub("</b>\s*\.\s*<b>", ". ", tlc.paratext) tlc.paratext = re.sub("Secretary- General", "Secretary-General", tlc.paratext) tlc.paratext = re.sub( "\s*(?:</i>\s*<i>|</b>\s*<b>|<b>\s*</b>|<i>\s*</i>|<b>\s*<i>\s*</b>\s*</i>)\s*", " ", tlc.paratext) tlc.paratext = tlc.paratext.strip() tlc.paratext = re.sub( "^<b>(The(?: Acting)? Co-Chairperson) \(([^\)]*)\)\s*(?:</b>\s*:|:\s*</b>)", "<b>\\1</b> (\\2):", tlc.paratext) tlc.lastindent = tlc.indents[-1][0] self.agenda = txpages[0].agenda self.chairs = txpages[0].chairs if self.bSecurityCouncil: self.seccouncilmembers = txpages[0].seccouncilmembers
def DetectSpeaker(ptext, indents, paranum, speakerbeforetookchair): #print ptext, "\n\n\n" if re.match("<i>(?:In favour|Against|Abstaining)", ptext): # should be part of a voteblock print ptext #print tlcall[i - 1].paratext assert False if re.match( "(?:The agenda was adopted\.|A vote was taken by show of hands\.|There being no objection, it is so decided\.)$", ptext): if IsNotQuiet(): print "italicizingline", len(indents), ptext ptext = "<i>%s</i>" % ptext indentationerror = "" if len(indents) == 1 and indents[0][0] == 0: if not re.match("<b> ", ptext) and not re.match( "(?:\(|<i>)+spoke in", ptext ): # often there is a speaker with a blank space at the front indentationerror = "unindented-paragraph" if len(indents) > 2: indentationerror = "too many different indents" if len(indents) == 2 and indents[1][0] != 0: if (indents[0][1] == 1 and ptext[0] == '"' and indents[0][0] - indents[1][0] > 30): # turn this into a blockquote indents[0] = (indents[0][0], indents[0][1] + indents[1][1], indents[0][2] + indents[1][2]) del indents[1] if IsNotQuiet(): pass #print "ququququq", indents else: indentationerror = "un-left-justified paragraph" mfixchinaspek = re.match( "<b>(Mr\. 
\w+)\s*</b>\s*([\w\-]+)\s*\((?:China|Republic of Korea)\)", ptext) if mfixchinaspek: #print "fixing chinaspeak", ptext, "\n" ptext = "<b>%s %s</b> %s" % (,, ptext[mfixchinaspek.end(2):]) #print ptext if"\s\S\s\S\s\S\s", ptext): print ptext raise unexception("probable gaps in text", paranum) mspek = re.match(respekp1, ptext) if not mspek: mspek = re.match(respekp2, ptext) if not mspek: mspek = re.match(respekp3, ptext) if not mspek: mspek = re.match(respek, ptext) assert not mspek or not"[<>]", if not mspek and re.match("<[ib]>", ptext): speakerbeforetookchair = "" if mspek or speakerbeforetookchair: if indentationerror == "unindented-paragraph" and speakerbeforetookchair: indentationerror = False if indentationerror == "unindented-paragraph" and paranum.undocname in [ "A-55-PV.60", "A-55-PV.63", "A-55-PV.64", "A-55-PV.68", "A-55-PV.59", "A-55-PV.44", "A-55-PV.46", "A-55-PV.48", "A-55-PV.49", "A-55-PV.52", "A-55-PV.56", "A-55-PV.51", "A-60-PV.37", "A-60-PV.38", "A-60-PV.42", "A-60-PV.51", "A-60-PV.79", "A-60-PV.85", "A-60-PV.91", "A-60-PV.86", "A-60-PV.87", "A-60-PV.92", "A-60-PV.93", "A-60-PV.94" ]: indentationerror = False if indentationerror: print ptext print indents raise unexception(indentationerror + " of speaker-intro", paranum) if respekSS and not mspek: m = re.match(respekSS, ptext) if IsNotQuiet(): print ptext print " ___ ", m and if mspek: assert not indentationerror assert not re.match("<i>", ptext) speakr = re.sub("\s+", " ", nation = "" bIsNotnation = True lnation = mbumpnation ="([^(]*?)\s*\(([^)]*)\)$", speakr) if mbumpnation and not lnation and FixNationName(, paranum.sdate): speakr = lnation = if IsNotQuiet(): print "BBBB bumpingnat", speakr, lnation if lnation: nation = IsPrenation(lnation, paranum.sdate) if not nation: nation = FixNationName(lnation, paranum.sdate) bIsNotnation = not nation if not nation: nation = IsNonnation(lnation, paranum.sdate) if not nation: print ptext print "\ncheck if misspelt or new nonnation, can add * to front of it: 
", lnation raise unexception("unrecognized nationC or nonnation", paranum) elif not re.match( "The(?: Acting| Temporary)? President|The(?: Deputy| Assistant)? Secretary-General|The(?: Acting)? Chairman|Transcript", speakr): if IsNotQuiet(): # allow for less strict when done by cronjob raise unexception("missing nation for %s" % speakr, paranum) if not re.match( "Mr\.|Mrs\.|Miss |Ms\.|Pope |The |King |Sultan |Prince |Secretary|Arch|Dr\.|Sir |Sheikh?a? |President |Monsignor |Chairman |Crown |His |Dame |Senator |Cardinal |Chief |Captain |Acting |Begum |Major-General |Shaikh |Judge |Count |Emir |Baroness |General |Nana |Princess |U |Rev\. |Kofi |Sayyid |Sheika |Bishop |Sir. |Wilmot |Eliza |Jos|Lord |Justice |Father |Commodore |Metropolitan |Transcript|Madam ", speakr): print speakr raise unexception("improper title on speaker", paranum) if"[\.,:;]$", speakr): print speakr raise unexception("improper tail on speaker", paranum) if"[,:;\(\)]", speakr): print speakr raise unexception("improper contents in speaker", paranum) typ = "spoken" currentspeaker = (speakr, nation, ( or ""), bIsNotnation ) # name, nation, language #print currentspeaker ptext = ptext[mspek.end(0):] if"</b>", ptext): print ptext raise unexception("bold in spoken text", paranum) elif speakerbeforetookchair: assert not indentationerror typ = "spoken" currentspeaker = speakerbeforetookchair #print "Continuation speaker", speakerbeforetookchair # non-spoken text else: #<b>Mr. 
Al-Mahmoud </b>(Qatar) (<i>spoke in Arabic</i>): if re.match("<b>.*?(?:</b>.*?:|:</b>)(?!</b>$)", ptext): print ptext raise unexception("improperly detected spoken text", paranum) if re.match("\(?<i>", ptext): mballots ="Number of ballot papers", ptext) if mballots: #print "BALLOT:", ptext, "\n" indentationerror = False if indentationerror: print ptext print indents raise unexception(indentationerror + " of unspoken text", paranum) if not mballots: mptext = re.match( "<i>(.*?)</i>\.?\s*(?:\((?:resolutions?|decision|draft resolution) (A?[\d/]*\s*(?:\(?[A-Z,\s]*(?:and|to) [A-Z]\)?|[A-Z]{1,2})?)\))?\.?$", ptext) if not mptext and not re.match("\(<i>spoke in", ptext): print "--%s--" % ptext raise unexception("improper italicline", paranum) ptext = re.sub("</?[ib]>", "", ptext).strip() # further parsing of these phrases may take place in due course msodecided = re.match( "(?:There being no objection, )?[Ii]t (?:was|is) so decided(?: \(decision [\d/]*\s*(?:A|B|C|A and B)?\))?\.?$", ptext) mwasadopted = re.match( ".*?(?:resolution|decision|agenda|amendment|recommendation).*?(?:was|were) adopted(?i)", ptext) mcalledorder = re.match( "The meeting (?:was called to order|rose|was suspended|was adjourned|resumed|was resumed) (?:at|on)", ptext) mtookchair = re.match( "\s*(?:In the absence of the President, )?(.*?)(?:, \(?Vice[\-\s]President\)?,)? (?:took|in) the [Cc]hair\.?$", ptext) mretchair = re.match( "(?:The President|.*?, Vice-President,|Mrs. Albright.*?|Baroness Amos) (?:returned to|in) the Chair.$", ptext) mescort = "(?:was escorted|escorted the.*?) 
(?:(?:from|to) the (?:rostrum|podium|platform)|(?:from|into|to its place in) the (?:General Assembly Hall|Conference Room|Security Council Chamber))(?: by the President and the Secretary-General)?\.?$", ptext) msecball = "A vote was taken by secret ballot\.(?: The meeting was suspended at|$)", ptext) mminsil = "The (?:members of the (?:General )?Assembly|Council) observed (?:a|one) minute of (?:silent prayer (?:or|and) meditation|silence)\.$", ptext) mtellers = "At the invitations? of the (?:Acting )?Presidents?.*?acted as tellers\.$|Having been drawn by lot", ptext) melected = "[Hh]aving obtained (?:the required (?:two-thirds )?|an absolute )majority.*?(?:(?:were|was|been|is) s?elected|will be included [io]n the list)", ptext) mmisc = "The Acting President drew the following.*?from the box|sang.*?for the General Assembly|The Assembly heard a musical performance|The Secretary-General presented the award to|From the .*? Group:|Having been drawn by lot by the (?:President|Secretary-General),|were elected members of the Organizational Committee|President \w+ and then Vice-President|Vice-President \S+ \S+ presided over|The following .*? States have.*?been elected members of the Security Council", ptext) mmiscnote ="\[In the 79th plenary .*? III.\]$", ptext) mmstar = re.match("\*", ptext) # insert * in the text mmspokein = re.match( "\(spoke in \w+(?:; interpretation.*?|; .*? the delegation)?\)$", ptext) matinvite = re.match( "(?:At the invitation of the President, )?.*? (?:(?:took (?:a |the )?|were escorted to their )seats? at the Council table|(?:took|was invited to take) (?:(?:the |a |their )?(?:seat|place)s? 
reserved for \w+|a seat|a place|places|seats|their seats|his seat) at the (?:side of the )?Council (?:[Cc]hamber|table))(?:;.*?Chamber)?\.$", ptext) mscsilence = re.match( "The members of the (?:Security )?Council observed a minute of silence.$", ptext) mscescort = "(?:were|was) escorted to (?:seats|a seat|his place|a place) at the (?:Security )?Council table.$", ptext) mvtape = re.match( "A video ?(?:tape)? was (?:shown|played|displayed) in the Council Chamber.$|An audio tape, in Arabic,|The members of the General Assembly heard a musical performance.$", ptext) mvprojscreen = re.match( "(?:An image was|Two images were|A video was) projected on screen\.$", ptext) mvresuadjourned = re.match( "The meeting was resumed and adjourned on.*? a\.m\.$", ptext) if mmstar: ptext = ptext[1:] # first line is from general assembly. Second line adds in some from security council if not (msodecided or mwasadopted or mcalledorder or mtookchair or mretchair or mballots or mescort or msecball or mminsil or mtellers or mmisc or melected or mmstar or mmiscnote or mmspokein or \ mvprojscreen or matinvite or mscsilence or mscescort or mvtape or mvresuadjourned): print "unrecognized--%s--" % ptext print re.match("At the invitations? of the (?:Acting )?", ptext) raise unexception("unrecognized italicline", paranum) # we can add subtypes to these italic-lines typ = "italicline" if mtookchair or mretchair: typ = "italicline-tookchair" if mmspokein: typ = "italicline-spokein" currentspeaker = None elif re.match("<b>", ptext): if not re.match(reboldline, ptext): print ptext raise unexception("unrecognized bold completion", paranum) ptext = re.sub("</?b>", "", ptext).strip() typ = "boldline" currentspeaker = None else: typ = "unknown" print ptext, indents raise unexception("possible indent failure", paranum) return ptext, typ, currentspeaker
def AppendCluster(res, tlc, sclusttype):
    """Append the cluster tlc to the list res, or merge it into res[-1].

    res        -- list of clusters built so far; modified in place
    tlc        -- cluster to add; needs .indents (list of tuples whose first
                  entry is the indent and third a line count) and .txls
                  (line objects with .ltext)
    sclusttype -- "gapcluster", "newpage" or "newcolumn"

    A single-indent cluster that starts a page or column is treated as the
    continuation of the previous paragraph, and its lines are appended to
    res[-1].txls, when the indents are compatible and both sides open with
    the same <b>/<i> tag (or with neither).  A cluster with four indents in
    an ABAB pattern is split into two paragraphs.  Any other cluster with
    more than two indents raises unexception.
    """
    assert sclusttype in ["gapcluster", "newpage", "newcolumn"]

    # check if we should merge to the previous paragraph
    if res and sclusttype != "gapcluster" and len(tlc.indents) == 1:
        prevcluster = res[-1]
        indentp = prevcluster.indents[-1][0]
        indentn = tlc.indents[0][0]
        bbothindented = ((indentp in [31, 32]) and (indentn in [31, 32])) or \
                        ((indentp in [0, 1]) and (indentn in [0, 1])) or \
                        ((indentp in [36, 33]) and (indentp == indentn))
        # one-line indented paragraph opening that carries on at the left margin
        bonelineparacont = (len(prevcluster.indents) == 1) and \
                           (prevcluster.indents[0][1] == 1) and \
                           (indentp in [31, 32]) and (indentn in [0, 1])

        # compare the leading style tag (if any) on either side of the break
        lastltext = prevcluster.txls[-1].ltext
        firstltext = tlc.txls[0].ltext
        td0 = lastltext[:3]
        td1 = firstltext[:3]
        if not re.match("<[ib]>", td0):
            td0 = ""
        if not re.match("<[ib]>", td1):
            td1 = ""
        bstylematches = (td0 == td1)
        # vote lists and agenda headings always stand as paragraphs of their own
        if re.match("<i>In favour", firstltext):
            bstylematches = False
        if re.match("<b>Agenda", lastltext):
            bstylematches = False

        # likely continuation of paragraph
        if bstylematches and (bbothindented or bonelineparacont):
            prevcluster.txls.extend(tlc.txls)
            return

    # new cluster; check the indenting pattern is good
    nindents = len(tlc.indents)
    if nindents == 2:
        pass  # any two-indent pattern is accepted (e.g. <i>In favour:</i>)

    # two paragraphs may have been merged, try to separate them out
    elif nindents == 4 and tlc.indents[0][0] == tlc.indents[2][0] \
            and tlc.indents[1][0] == tlc.indents[3][0]:
        si = tlc.indents[0][2] + tlc.indents[1][2]  # lines in the first paragraph
        tlcf = TextLineCluster(None)
        tlcf.txls = tlc.txls[:si]
        del tlc.txls[:si]
        tlcf.indents = tlc.indents[:2]
        del tlc.indents[:2]
        res.append(tlcf)

    elif nindents != 1:
        if IsNotQuiet():
            print("%s jjjj" % (tlc.indents,))
        prevtop = -1
        for txl in tlc.txls:
            if IsNotQuiet():
                # NOTE(review): the compared attribute is reconstructed as
                # txl.top (matching the name prevtop) -- confirm
                if prevtop == txl.top:
                    lead = "  "  # offsets a line sharing the previous line's top
                else:
                    lead = ""
                print("%s%s %s" % (lead, txl.indent, txl.ltext))
            prevtop = txl.top
        raise unexception(
            "unrecognized indent pattern",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))

    res.append(tlc)
    return
def GroupParas(tlcall, undocname, sdate, seccouncilmembers):
    """Group the paragraph clusters of one document into blocks.

    tlcall            -- list of paragraph clusters (each with .paratext)
    undocname         -- document code, e.g. "S-PV-4174"
    sdate             -- date string; everything after the first 10 characters
                         is passed on as the meeting start time
    seccouncilmembers -- handed through to VoteBlock

    Returns the list of VoteBlock / SpeechBlock objects, each given a
    paranum.blockno that counts from 1 within its page.  The last block also
    gets .rosetime.  An empty tlcall gives an empty list.

    Raises unexception when no rose time can be found for the document.
    """
    res = []
    i = 0
    curragendanum = ""
    while i < len(tlcall):
        tlc = tlcall[i]
        if re.match(recvoterequest, tlc.paratext):
            lblock = VoteBlock(tlcall, i, undocname, sdate, seccouncilmembers)
            i = lblock.i

        # non-voting line to be processed
        else:
            # a speech interrupted by a "took the Chair" / "spoke in" line
            # carries its speaker over to the paragraph that follows
            speakerbeforetookchair = ""
            if (len(res) > 2) and (res[-1].typ in ["italicline-tookchair", "italicline-spokein"]) \
                    and (res[-2].typ == "spoken"):
                speakerbeforetookchair = res[-2].speaker
                if res[-1].typ == "italicline-spokein":
                    assert len(res[-1].paragraphs) == 1
                    mspokein = re.search("spoke in (\w+)", res[-1].paragraphs[0][1])
                    if mspokein:
                        # same speaker, now speaking the newly announced language
                        speakerbeforetookchair = (speakerbeforetookchair[0],
                                                  speakerbeforetookchair[1],
                                                  mspokein.group(1),
                                                  speakerbeforetookchair[3])
                    elif IsNotQuiet():
                        # language not recognised: keep the previous one rather
                        # than fail on the missing match
                        print("unrecognized spokein %s" % (res[-1].paragraphs,))
            lblock = SpeechBlock(tlcall, i, undocname, sdate, speakerbeforetookchair,
                                 curragendanum)
            if lblock.agendanum:
                curragendanum = lblock.agendanum
            i = lblock.i

        # number the blocks within each page
        if res and res[-1].paranum.pageno == lblock.paranum.pageno:
            lblock.paranum.blockno = res[-1].paranum.blockno + 1
        else:
            lblock.paranum.blockno = 1
        res.append(lblock)

    # find the rosetime
    if res:
        lastblock = res[-1]
        starttime = sdate[10:].strip()
        lastblock.rosetime = lastblock.ExtractRoseTime(starttime)

        # documents with no rose time get the start time instead
        if undocname in ["S-PV-3698", "S-PV-3698-Resu.1", "S-PV-3765-Resu.2",
                         "S-PV-4072-Resu.1", "S-PV-4174", "S-PV-4223", "S-PV-5100"]:
            assert not lastblock.rosetime
            lastblock.rosetime = starttime

        # the missing rosetimes
        if not lastblock.rosetime:
            if undocname == "A-62-PV.79":
                lastblock.rosetime = "06:05"
            else:
                lastblock.writeblock(sys.stdout)
                raise unexception("can't find rosetime", lastblock.paranum)

    return res
def ParsetoHTML(stem, pdfxmldir, htmldir, bforceparse, beditparse, bcontinueonerror): undocnames = [] for undoc in os.listdir(pdfxmldir): undocname = os.path.splitext(undoc)[0] if undoc[-1] == "~": continue if not re.match(stem, undocname): continue if"Corr", undocname): # skip corregendas continue if not bforceparse: undochtml = os.path.join(htmldir, undocname + ".html") undochtmlunindexed = os.path.join(htmldir, undocname + ".unindexed.html") if os.path.isfile(undochtml) or os.path.isfile(undochtmlunindexed): continue undocnames.append(undocname) undocnames.sort() if IsNotQuiet(): print "Preparing to parse %d files" % len(undocnames) for undocname in undocnames: undocpdfxml = os.path.join(pdfxmldir, undocname + ".xml") undochtml = os.path.join(htmldir, undocname + ".html") # used to be ".unindexed.html" gparas = None lbeditparse = beditparse while not gparas: fin = open(undocpdfxml) xfil = fin.close() if IsNotQuiet(): print "parsing:", undocname, try: if lbeditparse: lbeditparse = False raise unexception("editparse", None) glueunfile = GlueUnfile(xfil, undocname) if not glueunfile.tlcall: break # happens when it's a bitmap type, or communique if IsNotQuiet(): print glueunfile.sdate #, chairs gparas = GroupParas(glueunfile.tlcall, undocname, glueunfile.sdate, glueunfile.seccouncilmembers) except unexception, ux: assert not gparas if ux.description != "editparse": if bcontinueonerror: break print "\n\nError: %s on page %s textcounter %s" % ( ux.description, ux.paranum.pageno, ux.paranum.textcountnumber) print "\nHit RETURN to launch your editor on the pdfxml file (or type 's' to skip, or 't' to throw)" rl = sys.stdin.readline() if rl[0] == "s": break if rl[0] == "t": raise if ux.description != "editparse": fin = open(undocpdfxml, "r") finlines = fin.close() mfinlines = re.match( "(?s)(.*?<text ){%d}" % ux.paranum.textcountnumber, finlines) ln ="\n") else: ln = 1 #editor = os.getenv('EDITOR') if sys.platform == "win32": os.system('"C:\Program Files\ConTEXT\ConTEXT" 
%s /g00:%d' % (undocpdfxml, ln + 2)) else: os.system('vim "%s" +%d' % (undocpdfxml, ln + 2)) if not gparas: continue # actually write the file tmpfile = undochtml + "--temp" fout = open(tmpfile, "w") fout.write('<html>\n<head>\n') fout.write( '<link href="unview.css" type="text/css" rel="stylesheet" media="all">\n' ) fout.write('</head>\n<body>\n') fout.write('\n<div class="heading" id="pg000-bk00">\n') sdate, stime = glueunfile.sdate[:10], glueunfile.sdate[10:].strip() fout.write( '\t<span class="code">%s</span> <span class="date">%s</span> <span class="time">%s</span>' % (undocname, sdate, stime)) if gparas: fout.write('<span class="rosetime">%s</span>' % gparas[-1].rosetime) fout.write('\n</div>\n') if glueunfile.bSecurityCouncil: fout.write('\n<div class="council-agenda" id="pg000-bk01">\n') fout.write( '\t<p class="boldline-p" id="pg000-bk01-pa01">%s</p>\n' % glueunfile.agenda) fout.write('</div>\n') fout.write('\n<div class="council-attendees" id="pg000-bk02">\n') ichairn = 0 for chair in glueunfile.chairs: ichairn += 1 fout.write('\t<p id="pg000-bk02-pa%02d">' % ichairn) for chperson in chair[0].split( "/" ): # just for the extremely rare case we get two people sharing the seat fout.write('<span class="name">%s</span> ' % chperson.strip()) fout.write( '<span class="nation">%s</span> <span class="place">%s</span></p>\n' % (chair[1], chair[2])) fout.write('</div>') if glueunfile.bGeneralAssembly: fout.write('\n<div class="assembly-chairs" id="pg000-bk03">\n') ichairn = 0 for chair in glueunfile.chairs: ichairn += 1 fout.write( '\t<p id="pg000-bk03-pa%02d"><span class="name">%s</span> <span class="nation">%s</span> <span class="place">president</span></p>\n' % (ichairn, chair[0], chair[1])) fout.write('</div>\n') for gpara in gparas: gpara.writeblock(fout) # this for making the parsing a little easier fout.write('\n<div class="end-document" id="pg999-bk99">\n') fout.write('</div>\n') fout.write('\n</body>\n</html>\n') fout.close() if os.path.isfile(undochtml): 
os.remove(undochtml) os.rename(tmpfile, undochtml)
def __init__(self, tlcall, i, lundocname, lsdate, speakerbeforetookchair, prevagendanum): self.tlcall = tlcall self.i = i self.sdate = lsdate self.undocname = lundocname self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname) if not self.bSecurityCouncil: self.genasssess = re.match("A-(\d+)", self.undocname).group(1) self.agendanum = "" self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum # paranum = ( undocname, sdate, tlc.txls[0].pageno, paranumber ) #self.gid = self.paranum.MakeGid() tlc = self.tlcall[self.i] #print "\npppp", tlc.indents, tlc.paratext, tlc.txls ptext, self.typ, self.speaker = DetectSpeaker(tlc.paratext, tlc.indents, self.paranum, speakerbeforetookchair) ptext = MarkupLinks(CleanupTags(ptext, self.typ, self.paranum), self.undocname, self.paranum) self.i += 1 if self.typ in [ "italicline", "italicline-tookchair", "italicline-spokein" ]: self.paragraphs = [("italicline", ptext)] return # series of boldlines if self.typ == "boldline": self.agendanum = "" blinepara = tlc.lastindent and "blockquote" or "p" # detect the agenda if not self.bSecurityCouncil: self.agendanum = DetectAgendaForm(ptext, self.genasssess, prevagendanum, self.paranum) #print "aaaaa ", self.agendanum if not self.agendanum: if IsNotQuiet(): print "if no agenda, add to AgendaTypeMap" raise unexception(" uncategorized agenda title", self.paranum) self.paragraphs = [(blinepara, ptext)] while self.i < len(self.tlcall): tlc = self.tlcall[self.i] if not re.match(reboldline, tlc.paratext): break ptext = MarkupLinks( CleanupTags(tlc.paratext, self.typ, self.paranum), self.undocname, self.paranum) # a second agenda number gets found if not self.bSecurityCouncil and re.match( "Agenda(?: item)? 
\d+(?i)", ptext): agendanum2 = DetectAgendaForm(ptext, self.genasssess, prevagendanum, self.paranum) print "agendanum from second line", agendanum2 assert agendanum2, ptext # must detect it if"misc|show|address", self.agendanum): self.agendanum = agendanum2 # a woolly agenda can be over-ridden elif self.undocname == "A-62-PV.74": self.agendanum = "%s,%s" % (self.agendanum, agendanum2) else: print self.agendanum print ptext raise unexception(" unknown extra agendanum case", self.paranum) print "aaaa2aa ", self.agendanum self.paragraphs.append((tlc.lastindent and "boldline-indent" or "boldline-p", ptext)) self.i += 1 return # actual spoken section assert self.typ == "spoken" assert tlc.lastindent == 0 or len( tlc.indents) == 1 # doesn't happen in first paragraph of speech self.paragraphs = [("p", ptext)] while self.i < len(self.tlcall): tlc = self.tlcall[self.i] if self.DetectEndSpeech(tlc.paratext, tlc.lastindent, self.sdate): break ptext = MarkupLinks( CleanupTags(tlc.paratext, self.typ, self.paranum), self.undocname, self.paranum) bIndent = (len(tlc.indents) == 1) and ( tlc.indents[0][0] != 0) and (tlc.indents[0][1] > 1) self.paragraphs.append(((bIndent and "blockquote" or "p"), ptext)) self.i += 1