def StripPageTags(xfil, undocname):
    """Split the pdftohtml XML into per-page strings with the <page> headers
    stripped off.  Returns a list of page bodies, or False when the document
    turns out to be a scanned-bitmap type with no text layer."""
    xpages = re.findall("(<page.*\s[\s\S]*?)</page>", xfil)
    # everything on page 1 before the first <text is the page header
    mpage1head = re.match("([\s\S]*?)(?=<text)", xpages[0])
    #print len(xpages), undoc
    if not mpage1head:
        # no <text at all: every page must match the known bitmap-page form
        if IsNotQuiet():
            print " -- bitmap type"
        for xpage in xpages:
            if not re.match(pagebitmap, xpage):
                print xpage
                print undocname
                assert False
        return False
    if not re.match(page1bit, mpage1head.group(1)):
        if IsNotQuiet():
            print "Probably is a bitmap type"
            print mpage1head.group(1)
        assert False
        return False
    res = [xpages[0][mpage1head.end(0):]]
    for i in range(1, len(xpages)):
        mpageihead = re.match(pageibit, xpages[i])
        # page numbering is expected to be sequential; known exceptions are
        # listed in the module-level misnumberedpages
        if int(mpageihead.group(1)) != i + 1:
            if undocname not in misnumberedpages:
                print "misnumberedpages", mpageihead.group(1), i + 1, undocname, "not in list:", misnumberedpages
                assert False
        res.append(xpages[i][mpageihead.end(0):])
    return res
def ExtractDateTime(self, txline, ltext):
    """Try to parse a meeting date/time header line (e.g. "Tuesday, 3
    December 2002, 10 a.m.") out of ltext and store it on self.date as
    "YYYY-MM-DD HH:MM".  Silently returns when the line is not a date;
    raises unexception on malformed month/year/time or a second date."""
    # extract the date out if poss
    mdate = re.match(
        "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$",
        ltext)
    if not mdate:
        #Tuesday, 3 December 2002, 10 a.m.
        # diagnostic only: show what a slightly looser regex would have done
        if re.search("Friday", ltext) and IsNotQuiet():
            print ltext, re.match(
                "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?m\.?| noon\.?)?(?: \(closed\))?",
                ltext)
        return
    #print txlines[ih].ltext
    iday = int(mdate.group(1))
    if mdate.group(2) not in months:
        if IsNotQuiet():
            print mdate.group(2), months
        raise unexception(
            "unrecognized month",
            paranumC(txline.undocname, None, 0, -1, txline.textcountnumber))
    imonth = months.index(mdate.group(2))  # 0-based index into months
    syear = mdate.group(3)
    if not re.match("(?:20\d\d|19\d\d)$", syear):
        raise unexception(
            "bad year",
            paranumC(txline.undocname, None, 0, -1, txline.textcountnumber))
    ihour = int(mdate.group(4))
    imin = mdate.group(5) and int(mdate.group(5)) or 0
    # am/pm normalization: "12 a.m." -> 0h, "N p.m." (N != 12) -> N+12
    if mdate.group(6) and mdate.group(6) == "a" and ihour == 12:
        ihour = 0
    elif mdate.group(6) and mdate.group(6) == "p" and ihour != 12:
        ihour += 12
    if self.date:
        raise unexception(
            "date redefined",
            paranumC(txline.undocname, None, 0, -1, txline.textcountnumber))
    if not (0 <= ihour <= 23) or not (0 <= imin <= 59):
        if IsNotQuiet():
            print ltext
        raise unexception(
            "bad time",
            paranumC(txline.undocname, None, 0, -1, txline.textcountnumber))
    self.date = "%s-%02d-%02d %02d:%02d" % (syear, imonth + 1, iday, ihour, imin)
def ExtractPVlinks(meetingrecs): mpvcode = re.match("S/PV\.(\d+)\s*(?:\(Resumption\s*([I\d]*\))\s*)?(?:\(Part\s*(I*)\)\s*)?(\(closed\))?$", meetingrecs[0]) assert mpvcode, meetingrecs #print meetingrecs pvcode = "S-PV-%s" % mpvcode.group(1) meetingnumber = int(mpvcode.group(1)) secondarymeetingnumber = 0 if mpvcode.group(2): # needs to have the bracket so there is always something rv = mpvcode.group(2)[:-1] if not rv or rv == "I": rv = "1" pvcode = "%s-Resu.%d" % (pvcode, int(rv)) secondarymeetingnumber = int(rv) if mpvcode.group(3): assert not secondarymeetingnumber # parts and resu. don't mix if mpvcode.group(3) == "I": rp = 1 elif mpvcode.group(3) == "II": rp = 2 pvcode = "%s-Part.%d" % (pvcode, rp) secondarymeetingnumber = rp corrs = [ ] for corr in meetingrecs[1:]: if corr: mcorr = re.match("Corr\.(\d)\s*$", corr) assert mcorr, meetingrecs assert int(mcorr.group(1)) >= len(corrs) + 1, meetingrecs # sometimes misses a corr corrs.append("%s-Corr.%d" % (pvcode, int(mcorr.group(1)))) #print pvcode, meetingrecs[0] if mpvcode.group(4) and IsNotQuiet(): print "the closed one:", pvcode return pvcode, (meetingnumber, secondarymeetingnumber), corrs
def ConvertXML(stem, pdfdir, pdfxmldir, bForce):
    """Run pdftohtml -xml over every PDF in pdfdir whose name matches stem,
    writing the result into pdfxmldir.  Existing outputs are skipped unless
    bForce is set."""
    for sd in os.listdir(pdfdir):
        if stem and not re.match(stem, sd):
            continue
        sdn, sde = os.path.splitext(sd)
        if sde != ".pdf":
            continue
        pdf = os.path.join(pdfdir, sd)
        xmldest = os.path.join(pdfxmldir, sdn + ".xml")
        if os.path.isfile(xmldest):
            if not bForce:
                #if IsNotQuiet():
                #    print "skipping", sd
                continue
            os.remove(xmldest)
        #shutil.copyfile(pdf, pdfdest)
        # pdftohtml appends ".xml" itself, so pass the bare stem
        tmpxml = "temph.xml"
        cmd = 'pdftohtml -xml "%s" "%s"' % (pdf, os.path.splitext(tmpxml)[0])
        if IsNotQuiet():
            print cmd
        else:
            cmd = cmd + " >/dev/null 2>&1"  # can't turn off output, so throw away even stderr yeuch
        os.system(cmd)
        if not os.path.isfile(tmpxml):
            print "Failed to execute and generate file"
            print cmd
            continue
        if sys.platform == "win32" and os.path.isfile(xmldest):
            # can't rename onto existing file in Windows
            os.remove(xmldest)
        os.rename(tmpxml, xmldest)
def WriteSCSummaries(stem, scsummariesdir, htmldir, pdfdir): screcords = [ ] # this is iterating through the pages of indexes, so will be in order for lf in reversed(sorted(os.listdir(scsummariesdir))): if re.match("\.svn", lf): continue myear = re.search("\d\d\d\d", lf) assert myear, lf year = myear.group(0) if stem and not re.match(stem, year): continue if IsNotQuiet(): print "year", year f = os.path.join(scsummariesdir, lf) fin = open(f) ftext = fin.read() fin.close() for mrow in re.finditer('(?s)<tr valign="top">(.*?)</tr>', ftext): row = mrow.group(1).strip() screcord = SCrecord(year, row, htmldir) screcord.FindTopicCats(htmldir, pdfdir) screcord.nextpvcode = screcords and screcords[-1].pvcode or None screcords.append(screcord) model.load_sc_topics(screcord.pvcode, screcord.otopicrecstr, screcord.datetime, screcord.datetimeend, screcord.topics, screcord.minutes, screcord.numspeeches, screcord.numparagraphs, screcord.numvotes, screcord.nextpvcode)
def ExtractDotLineChair(self, txlines, ih): assert self.pageno == 1 #<text top="334" left="185" width="584" height="17" font="2">Mr. Kavan . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . (Czech Republic)</text> while True: #print "------" + txlines[ih].ltext mchair = re.search("([^>:]*?)\s*\. \. \. \. \.", txlines[ih].ltext) if mchair: break # fix missing year date #if self.undocname == "A-55-PV.44" and txlines[ih].ltext == "Monday, 30 October, 10 a.m.": # txlines[ih].ltext = "Monday, 30 October 2000, 10 a.m." self.ExtractDateTime(txlines[ih], txlines[ih].ltext) ih += 1 if ih == len(txlines): return -1 if not self.date: if IsNotQuiet(): for i in range(ih): print "--%s--" % txlines[i].ltext raise unexception( "dotlinechair date problem", paranumC(txlines[ih].undocname, None, 0, -1, txlines[ih].textcountnumber)) assert False # when country name for the president . . . . is not on same line mcountry = re.search("\((.*?)\)$", txlines[ih].ltext) if not mcountry: ih += 1 #print txlines[ih].ltext mcountry = re.match("\((.*?)\)$", txlines[ih].ltext) if not mcountry: if IsNotQuiet(): print txlines[ih].ltext raise unexception( "unable to extract country from ...-line", paranumC(txlines[ih].undocname, None, 0, -1, txlines[ih].textcountnumber)) ih += 1 chairname = re.sub("\s\s+", " ", mchair.group(1)).strip() self.chairs.append( (chairname, FixNationName(mcountry.group(1), self.date))) return ih
def ScrapeContentsPageFromStem(stem):
    """Drive the PDF scraping from a document-stem string: generate GA
    verbatim codes forward from the last one present on disk ("A-NN-PV"),
    use a known contents-page URL, or scrape a Security Council yearly
    index ("S-YYYY-PV")."""
    # this attempts to scrape PV and corrigenda assembly vertbatims by generating the codes
    # we could lead on from the last known
    mpv = re.match("A-(\d+)-PV$", stem)
    if mpv:
        # these should search for gaps
        repv = re.compile("A-%s-PV.(\d+)(?:-Corr.(\d+))?" % mpv.group(1))
        pvdone = []
        for f in os.listdir(pdfdir):
            mfm = repv.match(f)
            if mfm:
                pvdone.append(int(mfm.group(1)))
        # onwards values
        pvdone.sort()
        if IsNotQuiet():
            print "pvddd", pvdone
        # probe forward from the highest meeting number on disk, giving up
        # after 3 consecutive misses
        v = (pvdone and pvdone[-1] or 0)
        vn = v + 1
        while vn - v < 3:
            if ScrapePDF("A-%s-PV.%d" % (mpv.group(1), vn)):
                v = vn
                ScrapePDF("A-%s-PV.%d-Corr.1" % (mpv.group(1), vn))
            vn += 1
        # missing values
        # walk back down pvdone filling any numbering gaps
        while len(pvdone) >= 2:
            vn = pvdone[-1] - 1
            if pvdone[-2] < vn:
                if ScrapePDF("A-%s-PV.%d" % (mpv.group(1), vn)):
                    ScrapePDF("A-%s-PV.%d-Corr.1" % (mpv.group(1), vn))
                pvdone[-1] = vn
            else:
                del pvdone[-1]
        return
    # this works from other contents pages for general assemblies
    if stem in scrapepvurlmap:
        ScrapeContentsPage(scrapepvurlmap[stem])
        return
    # security council scrapage
    mspv = re.match("S-(\d+)-PV", stem)
    if mspv:
        assert 1994 <= int(mspv.group(1)) < 2009  # should use current yeaR
        ScrapeSCContentsPage(int(mspv.group(1)), "http://www.un.org/Depts/dhl/resguide/scact%s.htm" % mspv.group(1))
        return
    print "Allowable stems for scraping are 'A-\d\d-PV' or 'S-\d\d\d\d(year)-PV', or"
    print ",\n ".join(scrapepvurlmap.keys())
    assert False
def WriteAgendaSummaries(stem, htmldir):
    """Extract agenda items from every parsed General Assembly html document
    and load the per-debate and per-agenda-number data into the database.
    Security Council documents are recognized and skipped silently."""
    rels = GetAllHtmlDocs("", False, False, htmldir)
    agendagroups = {}
    for htdoc in rels:
        maga = re.search("(A-\d\d-PV\.\d+)\.(?:unindexed\.)?html", htdoc)
        masc = re.search(
            "(S-PV.\d+(?:-(?:Resu|Part)\.\d)?)\.(?:unindexed\.)?html", htdoc)
        if not maga:
            # SC docs are silently skipped; anything else is unexpected
            if not masc:
                print "Whatis", htdoc
            continue
        docid = maga.group(1)
        if stem and not re.match(stem, docid):
            continue
        fin = open(htdoc)
        ftext = fin.read()
        fin.close()
        mdate = re.search('<span class="date">(\d\d\d\d-\d\d-\d\d)</span>', ftext)
        sdate = mdate.group(1)
        if IsNotQuiet():
            print docid,
        agendasdoc = AddAgendaGroups(agendagroups, sdate, docid, ftext)
        #if len(agendagroups) > 100:
        #    print "preeeematureabort"
        #    break
        # copy agenda data into database
        gaagindoc = []
        for ag in agendasdoc:
            gaagindoc.append(
                (ag.subheadingid, ag.agendanumstr, "||".join(ag.titlelines)))
        model.load_ga_debate(docid, sdate, gaagindoc)
    # the agendagroups are lists of agenda items; call them topics
    allagendas = []
    recentagendas = []
    for agendanum, aggroup in agendagroups.iteritems():
        agsession = aggroup[0][1].nsession
        mctitle, mccategory = FindDelCommonTitle(agendanum, aggroup)
        model.load_ga_agendanum(agsession, agendanum, mctitle, mccategory,
                                [(ag.docid, ag.subheadingid) for ag0, ag in aggroup])
def ScrapeGASummaries(gasummariesdir): for sess in range(1, currentsession + 1): f = os.path.join(gasummariesdir, "gaact%d.html" % sess) url = GASummariesURL(sess) if sess == currentsession or (sess == currentsession - 1 and currentmonth == 9) or not os.path.isfile(f): if IsNotQuiet(): print "Scraping", url fin = urllib2.urlopen(url) gaindext = fin.read() fin.close() fout = open(f, "w") fout.write(gaindext) fout.close()
def ScrapeContentsPage(contentsurl):
    """Fetch a UN contents/index page, extract every daccess-ods document
    link from it, normalize each link text to a dashed document code, and
    hand each one to ScrapePDF."""
    if IsNotQuiet():
        print "URL index:", contentsurl
    fin = urllib2.urlopen(contentsurl)
    plenaryindex = fin.read()
    fin.close()
    # <a href="http://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=A/57/PV.1&Lang=E" target="_blank">A/57/PV.1</a>
    plenaryindexlist = re.findall('<a\s+href="(http://daccess[^"]*)" target="_blank">(.*?)</a>(?is)', plenaryindex)
    if not plenaryindexlist:
        # alternate attribute ordering used on some index pages
        plenaryindexlist = re.findall('<a target="_blank" href="(http://daccess[^"]*)">(.*?)</a>(?i)', plenaryindex)
    for plenary in plenaryindexlist[:]:
        # normalize "A/57/PV.1" style link text to "A-57-PV.1" style codes
        undocname = re.sub("/", "-", plenary[1])
        undocname = re.sub("\s|<.*?>", "", undocname)
        undocname = re.sub("SecurityCouncilresolution", "S-RES-", undocname)
        assert re.match("(?:A-RES-\d\d-\d+|A-\d\d-PV-\d+|S-RES-\d+\(\d+\))$", undocname)
        ScrapePDF(undocname, contentsurl, plenary[0])
def __init__(self, txline, lundocname, lpageno, textcountnumber):
    """Parse one pdftohtml <text ...> element into a positioned text line.
    Sets top/left/width/height/font, the cleaned text in self.ltext (empty
    string means the line should be discarded), and self.bfootertype for
    lines straddling or matching the page footer."""
    mxline = re.match(
        '<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)" font="(\d+)">(.*?)</text>',
        txline)
    if not mxline:
        # NOTE(review): only prints a marker; the group() calls below will
        # then raise AttributeError -- effectively an assert with a trace
        print txline, "tttttt"
    self.top = int(mxline.group(1))
    self.left = int(mxline.group(2))
    self.width = int(mxline.group(3))
    self.height = int(mxline.group(4))
    self.font = int(mxline.group(5))
    self.pageno = lpageno
    self.undocname = lundocname
    self.textcountnumber = textcountnumber
    self.ltext = mxline.group(6).strip()
    # collapse empty italic/bold spans, then blank out purely-markup lines
    self.ltext = re.sub("<i>\s*</i>|<b>\s*</b>", " ", self.ltext)
    if re.match("<[ib]>\s*</[ib]>|\s*$", self.ltext):
        self.ltext = ""  # will be removed
    if not self.ltext:
        return
    # footer if it straddles the column divider at x=459 or matches footertext
    self.bfootertype = (self.left < 459 and self.left + self.width > 459) or re.match(
        footertext, self.ltext)
    #if self.bfootertype:
    #    print self.ltext
    # move on any short bits that are like 13^(th)
    if self.height == 11 and not self.bfootertype and self.width <= 10:
        #print self.left, self.width, "'%s'" % self.ltext
        assert self.width <= 10
        if self.ltext not in ["th", "rd", "st", "nd"]:
            if IsNotQuiet():
                print self.ltext
            raise unexception(
                "unrecognized shortbit",
                paranumC(self.undocname, None, 0, -1, self.textcountnumber))
        self.top += 2  # push the step down from 16 to 18
def ScrapeSCSummaries(scsummariesdir): #print "Skipping ScrapeSCSummaries" #return currentdate = datetime.date.today() currentyear = currentdate.year currentmonth = currentdate.month for y in range(1994, currentyear + 1): f = os.path.join(scsummariesdir, "scact%d.html" % y) url = "http://www.un.org/Depts/dhl/resguide/scact%d.htm" % y if y == currentyear or (y == currentyear - 1 and currentmonth == 1) or not os.path.isfile(f): if IsNotQuiet(): print "Scraping", url fin = urllib2.urlopen(url) scindext = fin.read() fin.close() fout = open(f, "w") fout.write(scindext) fout.close()
def LoadAllVotes(rels): res = { } # { nation : { voterecid: vote } } for nation in nationdates: res[nation] = { } res["Brunei Darussalam"] = {}# quick fix for rel in rels: if IsNotQuiet(): print "loading:", rel fin = open(rel) doccontent = fin.read() fin.close() document_id = re.search('<span class="code">([^<]*)</span>', doccontent).group(1) for recvotet in re.findall('<p class="votelist" id="(pg\d+-bk\d+)-pa\d+">(.*?)</p>', doccontent): #print document_id, recvotet[0] for voten in re.findall('<span class="([^"]*)">([^<]*)</span>', recvotet[1]): res[voten[1]][(document_id, recvotet[0])] = re.match(".*?([^\-]*)", voten[0]).group(1) #print res["Sudan"] return res
def AppendToCluster(txlcol, txl): # frig the indentation on the most common mistakes if re.match( "<i>The meeting (?:was called|was suspended|rose at|was resumed)", txl.ltext) and (txl.indent == 0): txl.indent = 31 if not txlcol: txlcol.append(TextLineCluster(txl)) return txl.vgap = txl.top - txlcol[-1].txls[-1].top #print txlcol[-1].txls[-1].ltext #print txl.vgap, txl.width, txl.height, txl.top, txl.ltext # zzzz # frig vgaps in some cases where the spacing was wider than normal if txl.undocname in ["A-50-PV.84", "A-50-PV.88"]: if txl.vgap == 21 or txl.vgap == 22: txl.vgap = 18 if txl.vgap == 42: txl.vgap = 43 if txl.undocname == "S-PV-5584": if txl.vgap == 20: txl.vgap = 19 if not txl.vgap in familiarvgaps: if IsNotQuiet(): print "\n\n vgap=", txl.vgap, "\n\nwidth/height/top", txl.width, txl.height, txl.top, txl.ltext # zzzz print " familiar vgaps:", familiarvgaps raise unexception( "vgap not familiar", paranumC(txl.undocname, None, 0, -1, txl.textcountnumber)) if txl.vgap in (0, 17, 18, 19) or txl.vgap == 0: txlcol[-1].AddLine(txl) else: #print txl.vgap, "vvvv", txl.ltext txlcol.append(TextLineCluster(txl))
def DetectAdoption(self):
    """Parse the "adopted/rejected by N votes to M with K abstentions" line
    at self.tlcall[self.i], reconcile the stated counts against the tallied
    vote lists (self.vlfavour/vlagainst/vlabstain), store the marked-up text
    in self.motiontext and advance self.i.

    Raises unexception when no count line is found or the counts disagree
    (except for a whitelist of documents known to be misprinted)."""
    adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext)
    madtext = re.search(
        "(adopted|carried|retained.*?|rejected)(?:, as amended,|, as a whole,)?\s+by(?: votes)?\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
        adtext)
    if not madtext:
        # fallback form: "By N votes to M ..."
        madtext = re.match(
            "(By)\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
            adtext)
    if not madtext:
        print "--%s-- %d" % (adtext, self.i)
        raise unexception("by votes problem", self.tlcall[self.i].paranum)
    ifavour = int(madtext.group(2))
    iagainst = (madtext.group(3) != "none" and int(madtext.group(3)) or 0)
    #if madtext.group(1) == "rejected":
    #    i = ifavour; ifavour = iagainst; iagainst = i
    iabstain = (madtext.group(4) and int(madtext.group(4)) or 0)
    # for a rejected motion the first number counts the "against" side
    if madtext.group(1) == "rejected":
        il = (iagainst, ifavour, iabstain)
    else:
        il = (ifavour, iagainst, iabstain)
    ivl = (len(self.vlfavour), len(self.vlagainst), len(self.vlabstain))
    if il != ivl:
        if IsNotQuiet():
            print "wrong-count", self.undocname, il, ivl
        # wrong values are found on A-57-PV.73 s(favour=154, 152)
        if self.undocname not in [
                "A-56-PV.82", "A-57-PV.73", "A-58-PV.54", "A-52-PV.69",
                "A-50-PV.90", "A-49-PV.83",
        ]:
            raise unexception("wrong votecount", self.tlcall[self.i].paranum)
    self.motiontext = MarkupLinks(adtext, self.undocname, self.paranum)
    self.i += 1
# in capital letters. Not currently incorporated into the system. if bGAsummaries: agsummariesdir = os.path.join(indexstuffdir, "gasummariesdir") if not os.path.isdir(agsummariesdir): os.mkdir(agsummariesdir) ScrapeGASummaries(agsummariesdir) sess = 1 ParseScrapeGASummaries(agsummariesdir, pdfinfodir, sess) if bNationData: ScrapePermMissions() NationDataSucker() if bVoteDistances: f = os.path.join(indexstuffdir, "votetable.txt") if IsNotQuiet(): print "Writing vote distance to file:", f fout = open(f, "w") WriteVoteDistances(stem, htmldir, fout) fout.close() if bDocimages: GenerateDocimages(stem, options.forcedocimg, options.limit, pdfdir, pdfpreviewdir, pdfinfodir, tmppdfpreviewdir) # this may be out-dated if bScrapewp: FetchWikiBacklinks(commentsdir) if bLoadMPs: LoadMPs()
def ParsetoHTML(stem, pdfxmldir, htmldir, bforceparse, beditparse, bcontinueonerror):
    """Parse every pdftohtml XML file matching stem into the project's html
    form.  On a parse failure the user may interactively edit the source XML
    (ConTEXT on Windows, vim elsewhere) and retry, skip, or re-raise; with
    bcontinueonerror set, failures are skipped without prompting."""
    # gather the list of documents still needing a parse
    undocnames = []
    for undoc in os.listdir(pdfxmldir):
        undocname = os.path.splitext(undoc)[0]
        if undoc[-1] == "~":
            continue
        if not re.match(stem, undocname):
            continue
        if re.search("Corr", undocname):  # skip corregendas
            continue
        if not bforceparse:
            undochtml = os.path.join(htmldir, undocname + ".html")
            undochtmlunindexed = os.path.join(htmldir, undocname + ".unindexed.html")
            if os.path.isfile(undochtml) or os.path.isfile(undochtmlunindexed):
                continue
        undocnames.append(undocname)
    undocnames.sort()
    if IsNotQuiet():
        print "Preparing to parse %d files" % len(undocnames)
    for undocname in undocnames:
        undocpdfxml = os.path.join(pdfxmldir, undocname + ".xml")
        undochtml = os.path.join(htmldir, undocname + ".html")  # used to be ".unindexed.html"
        gparas = None
        lbeditparse = beditparse
        # retry loop: re-read the (possibly hand-edited) XML until it parses
        while not gparas:
            fin = open(undocpdfxml)
            xfil = fin.read()
            fin.close()
            if IsNotQuiet():
                print "parsing:", undocname,
            try:
                if lbeditparse:
                    # force one trip through the edit branch on the first pass
                    lbeditparse = False
                    raise unexception("editparse", None)
                glueunfile = GlueUnfile(xfil, undocname)
                if not glueunfile.tlcall:
                    break  # happens when it's a bitmap type, or communique
                if IsNotQuiet():
                    print glueunfile.sdate  #, chairs
                gparas = GroupParas(glueunfile.tlcall, undocname,
                                    glueunfile.sdate, glueunfile.seccouncilmembers)
            except unexception, ux:
                assert not gparas
                if ux.description != "editparse":
                    if bcontinueonerror:
                        break
                    print "\n\nError: %s on page %s textcounter %s" % (
                        ux.description, ux.paranum.pageno, ux.paranum.textcountnumber)
                print "\nHit RETURN to launch your editor on the pdfxml file (or type 's' to skip, or 't' to throw)"
                rl = sys.stdin.readline()
                if rl[0] == "s":
                    break
                if rl[0] == "t":
                    raise
                if ux.description != "editparse":
                    # count newlines up to the failing <text element to give
                    # the editor a line number to jump to
                    fin = open(undocpdfxml, "r")
                    finlines = fin.read()
                    fin.close()
                    mfinlines = re.match(
                        "(?s)(.*?<text ){%d}" % ux.paranum.textcountnumber, finlines)
                    ln = mfinlines.group(0).count("\n")
                else:
                    ln = 1
                #editor = os.getenv('EDITOR')
                if sys.platform == "win32":
                    os.system('"C:\Program Files\ConTEXT\ConTEXT" %s /g00:%d' % (undocpdfxml, ln + 2))
                else:
                    os.system('vim "%s" +%d' % (undocpdfxml, ln + 2))
        if not gparas:
            continue
        # actually write the file
        tmpfile = undochtml + "--temp"
        fout = open(tmpfile, "w")
        fout.write('<html>\n<head>\n')
        fout.write(
            '<link href="unview.css" type="text/css" rel="stylesheet" media="all">\n'
        )
        fout.write('</head>\n<body>\n')
        fout.write('\n<div class="heading" id="pg000-bk00">\n')
        sdate, stime = glueunfile.sdate[:10], glueunfile.sdate[10:].strip()
        fout.write(
            '\t<span class="code">%s</span> <span class="date">%s</span> <span class="time">%s</span>'
            % (undocname, sdate, stime))
        if gparas:
            fout.write('<span class="rosetime">%s</span>' % gparas[-1].rosetime)
        fout.write('\n</div>\n')
        if glueunfile.bSecurityCouncil:
            fout.write('\n<div class="council-agenda" id="pg000-bk01">\n')
            fout.write(
                '\t<p class="boldline-p" id="pg000-bk01-pa01">%s</p>\n'
                % glueunfile.agenda)
            fout.write('</div>\n')
            fout.write('\n<div class="council-attendees" id="pg000-bk02">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write('\t<p id="pg000-bk02-pa%02d">' % ichairn)
                for chperson in chair[0].split(
                        "/"
                ):  # just for the extremely rare case we get two people sharing the seat
                    fout.write('<span class="name">%s</span> ' % chperson.strip())
                fout.write(
                    '<span class="nation">%s</span> <span class="place">%s</span></p>\n'
                    % (chair[1], chair[2]))
            fout.write('</div>')
        if glueunfile.bGeneralAssembly:
            fout.write('\n<div class="assembly-chairs" id="pg000-bk03">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write(
                    '\t<p id="pg000-bk03-pa%02d"><span class="name">%s</span> <span class="nation">%s</span> <span class="place">president</span></p>\n'
                    % (ichairn, chair[0], chair[1]))
            fout.write('</div>\n')
        for gpara in gparas:
            gpara.writeblock(fout)
        # this for making the parsing a little easier
        fout.write('\n<div class="end-document" id="pg999-bk99">\n')
        fout.write('</div>\n')
        fout.write('\n</body>\n</html>\n')
        fout.close()
        # write via a temp file then swap in, so a crash never leaves a
        # half-written html file in place
        if os.path.isfile(undochtml):
            os.remove(undochtml)
        os.rename(tmpfile, undochtml)
def GroupParas(tlcall, undocname, sdate, seccouncilmembers):
    """Group the parsed text-line clusters tlcall into a list of blocks
    (VoteBlock for recorded votes, SpeechBlock otherwise), numbering the
    blocks within each page and attaching the meeting rise-time to the
    final block.  Returns the list of blocks."""
    res = []
    i = 0
    currentspeaker = None
    curragendanum = ""
    while i < len(tlcall):
        tlc = tlcall[i]
        if re.match(recvoterequest, tlc.paratext):
            lblock = VoteBlock(tlcall, i, undocname, sdate, seccouncilmembers)
            i = lblock.i
        # non-voting line to be processed
        else:
            # when a "took the chair"/"spoke in" italic line interrupts a
            # speech, carry the previous speaker across the interruption
            speakerbeforetookchair = ""
            if (len(res) > 2) and (res[-1].typ in [
                    "italicline-tookchair", "italicline-spokein"
            ]) and (res[-2].typ == "spoken"):
                speakerbeforetookchair = res[-2].speaker
                if res[-1].typ == "italicline-spokein":
                    assert len(res[-1].paragraphs) == 1
                    mspokein = re.search("spoke in (\w+)", res[-1].paragraphs[0][1])
                    if not mspokein:
                        if IsNotQuiet():
                            print "unrecognized spokein", res[-1].paragraphs
                    # NOTE(review): mspokein.group(1) below raises if the
                    # search failed -- the print above does not guard it; verify
                    #print "converting spokein", speakerbeforetookchair[2], mspokein.group(1)
                    speakerbeforetookchair = (speakerbeforetookchair[0],
                                              speakerbeforetookchair[1],
                                              mspokein.group(1),
                                              speakerbeforetookchair[3])
            lblock = SpeechBlock(tlcall, i, undocname, sdate,
                                 speakerbeforetookchair, curragendanum)
            if lblock.agendanum:
                curragendanum = lblock.agendanum
            i = lblock.i
        # number the blocks sequentially within each page
        if res and res[-1].paranum.pageno == lblock.paranum.pageno:
            lblock.paranum.blockno = res[-1].paranum.blockno + 1
        else:
            lblock.paranum.blockno = 1
        res.append(lblock)
    # find the rosetime
    if res:
        res[-1].rosetime = res[-1].ExtractRoseTime(sdate[10:].strip())
        # documents known to lack a rise-time: fall back to the meeting time
        if undocname in [
                "S-PV-3698", "S-PV-3698-Resu.1", "S-PV-3765-Resu.2",
                "S-PV-4072-Resu.1", "S-PV-4174", "S-PV-4223", "S-PV-5100"
        ]:
            assert not res[-1].rosetime
            res[-1].rosetime = sdate[10:].strip()
        # the missing rosetimes
        if not res[-1].rosetime:
            if undocname == "A-62-PV.79":
                res[-1].rosetime = "06:05"
            else:
                res[-1].writeblock(sys.stdout)
                raise unexception("can't find rosetime", res[-1].paranum)
    return res
def ScrapePDF(undocname, plenaryurl="http://www.un.org/ga/59/documentation/list0.html", purl=None, bforce=False):
    """Download the PDF for a dashed UN document code (e.g. "A-57-PV.1",
    "S-RES-1234(1999)") into pdfdir.  When purl is not given, the ODS URL is
    constructed from the code by matching it against the known code families
    below.  Returns True when the file is already present; returns False for
    unrecognized/declined codes."""
    pdfname = undocname + ".pdf"
    pdffile = os.path.join(pdfdir, pdfname)
    if not bforce and os.path.isfile(pdffile):
        if IsNotQuiet():
            print " skipping", pdffile, pdfname
        return True
    if not purl:
        # classify the code: one regex per document family
        mares = re.match("A-RES-(\d+)-(\d+)$", undocname)
        maresr = re.match("A-RES-(\d+)\((S-I|[IVXL]+)\)$", undocname)  # resolutions used to have sessions in roman numerals
        meres = re.match("E-RES-(\d\d\d\d)-(\d+)$", undocname)  # don't know what the code is
        madoc = re.match("A-(\d\d)-((?:L\.|CRP\.)?\d+)([\w\.\-\(\)]*)$", undocname)
        msres = re.match("S-RES-(\d+)\((\d+)\)$", undocname)
        mapv = re.match("A-(\d\d)-PV.(\d+)(-Corr.\d|)$", undocname)
        macdoc = re.match("A-AC.(\d+)-(\d\d\d\d)-(\d+)$", undocname)
        maodoc = re.match("A-(\d+)(-?[\w\.\-]*)$", undocname)
        mspv = re.match("S-PV.(\d+)(?:-Resu\.(\d+))?$", undocname)
        scdoc = re.match("S-(\d\d\d\d)-(\d+)(-Corr.\d|)(\(SUPP\)|)$", undocname)
        mscodoc = re.match("S-(\d+)(-?[\w\.\-]*)$", undocname)
        #stdoc = re.match("ST-SGB-(\d+)$", undocname)  # experimental secretariat document
        dashdoc = re.match("ST-|A-C", undocname)
        munknown = re.match("(?:ECESA/1/Rev.1|S-26-2)$", undocname)
        mahrc = re.match("A-HRC(?:-S-(\d+))?-(\d[\w\.\-]*)$", undocname)
        mprst = re.match("S-PRST-(\d\d\d\d)-(\d+)$", undocname)
        if mares:
            if int(mares.group(1)) < 1:  # limit the sessions we take these resolutions from
                return False
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/RES/%s/%s&Lang=E" % (mares.group(1), mares.group(2))
        #if meres:
        #    purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=E/RES/%s/%s&Lang=E" % (meres.group(1), meres.group(2))
        elif maresr:
            if maresr.group(2) == "S-I":
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/RES/%s(S-1)&Lang=E" % (maresr.group(1))
            else:
                purl = "http://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=A/RES/%s(%s)&Lang=E&Area=RESOLUTION" % (maresr.group(1), maresr.group(2))
        elif dashdoc:
            # works for ST/SGB/...
            dashcode = re.sub("-", "/", undocname)
            #purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=ST/SGB/%s&Lang=E" % (stdoc.group(1))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=%s&Lang=E" % dashcode
        elif madoc:
            if int(madoc.group(1)) < 1:  # limit the sessions we take these resolutions from
                return False
            tail = re.sub("-", "/", madoc.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s/%s%s&Lang=E" % (madoc.group(1), madoc.group(2), tail)
            #print purl
        elif macdoc:
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/AC.%s/%s/%s&Lang=E" % (macdoc.group(1), macdoc.group(2), macdoc.group(3))
        elif scdoc:
            tail = re.sub("-", "/", scdoc.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/%s/%s%s%s&Lang=E" % (scdoc.group(1), scdoc.group(2), tail, scdoc.group(4))
        elif mprst:
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/PRST/%s/%s&Lang=E" % (mprst.group(1), mprst.group(2))
        elif msres:
            # older SC resolutions live in the RESOLUTION area
            sarea = int(msres.group(1)) <= 766 and "RESOLUTION" or "UNDOC"
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/RES/%s%%20(%s)&Lang=E&Area=%s" % (msres.group(1), msres.group(2), sarea)
            plenaryurl = "http://www.un.org/Docs/scres/2002/sc2002.htm"
        elif mspv:
            tail = mspv.group(2) and ("(Resumption%s)" % mspv.group(2)) or ""
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/PV.%s%s&Lang=E" % (mspv.group(1), tail)
            plenaryurl = "http://www.un.org/Docs/scres/2002/sc2002.htm"
        elif mapv:
            #if int(mapv.group(1)) < 40:  # limit the sessions we take these resolutions from
            #    return False
            tail = re.sub("-", "/", mapv.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s/PV.%s%s&Lang=E" % (mapv.group(1), mapv.group(2), tail)
        elif maodoc:
            tail = re.sub("-", "/", maodoc.group(2))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s%s&Lang=E" % (maodoc.group(1), tail)
            print "oldstyle doc", purl
        elif mscodoc:
            tail = re.sub("-", "/", mscodoc.group(2))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/%s%s&Lang=E" % (mscodoc.group(1), tail)
            print "oldstyle doc", purl
        elif mahrc:
            tail = re.sub("-", "/", mahrc.group(2))
            if mahrc.group(1):
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/HRC/S-%s/%s&Lang=E" % (mahrc.group(1), tail)
            else:
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/HRC/%s&Lang=E" % tail
            print "human rights council", purl
        elif meres or munknown:
            if IsNotQuiet():
                print "Unknown undocname", undocname
            return False
        else:
            if IsNotQuiet():
                print "Unrecognized undocname", undocname
            return False
    else:
        purl = re.sub("\s", "", purl)
        # NOTE(review): this substitution is a no-op as written; it was
        # presumably re.sub("&amp;", "&", purl) before the source was
        # html-mangled -- verify against the original file
        purl = re.sub("&", "&", purl)
        #print "$$%s$$" % purl
    if IsNotQuiet():
        print " scraping", undocname,
    if not purl:
        print "*** Need to make"
        return False
    ##return False
    # first go through the forwarding blocker
    purl = urlparse.urljoin(plenaryurl, purl)
    try:
        if IsNotQuiet():
            print purl
        plenarypdf = GetFromNet(undocname, purl, plenaryurl)
        if not plenarypdf:
            # retry with the (SUPP) suffix used for supplement documents
            purlsupp = re.sub("&Lang=E", "(SUPP)&Lang=E", purl)
            if purlsupp != purl:
                plenarypdf = GetFromNet(undocname, purlsupp, plenaryurl)
                #http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/61/5/Add.1(SUPP)&Lang=E
    except KeyboardInterrupt, e:
        print "\n *** Keyboard Interrupt"
        sys.exit(1);
def ExtractSeccounFrontPage(self, txlines):
    """Parse the Security Council front page out of txlines: the meeting
    date, the president line, the fifteen member seats, and the agenda text.
    Populates self.date, self.chairs, self.seccouncilmembers, self.agenda.
    Returns False for closed meetings / known exceptions, True otherwise."""
    self.date = None
    self.chairs = []
    self.seccouncilmembers = []
    self.agenda = []
    # first join together text fragments that share the same vertical position
    lasttop = -1
    jtxlines = []
    ih = 0
    while ih < len(txlines):
        if txlines[ih].top == lasttop:
            jtxlines[-1] = "%s %s" % (jtxlines[-1], txlines[ih].ltext)
        else:
            jtxlines.append(txlines[ih].ltext)
            lasttop = txlines[ih].top
        ih += 1
    del txlines  # just deletes the reference to this object
    ih = 0
    while ih < len(jtxlines):
        self.ExtractDateTime(None, jtxlines[ih])
        # president/chairman line: "<i>President:</i> Mr. X . . . (Country)"
        mpresseat = re.match(
            "<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$",
            jtxlines[ih])
        #print jtxlines[ih], mpresseat
        if mpresseat:
            if not self.date:
                if IsNotQuiet():
                    for i in range(ih):
                        print jtxlines[i]
                raise unexception(
                    "missingg date",
                    paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            if mpresseat.group(1) in ["President", "Chairman"]:
                assert len(self.chairs) == 0  # first one
            else:
                assert len(self.chairs) == 1  # later president
            ih += 1
            if mpresseat.group(3):
                scountry = mpresseat.group(3)
            else:
                scountry = ""
            # country bracket may spill onto the following line
            if re.search("\(", scountry) and not re.search("\)", scountry):
                scountry = "%s %s" % (scountry, jtxlines[ih])
                ih += 1
            mcountry = re.match("\((.*?)\)$", scountry)
            lfscountry = re.sub("\s+", " ", mcountry.group(1))
            fscountry = FixNationName(lfscountry, self.date)
            if not fscountry:
                if IsNotQuiet():
                    print "--%s--" % mcountry.group(1)
                raise unexception(
                    "unrecognized nationA",
                    paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            chairname = re.sub("\s\s+", " ", mpresseat.group(2)).strip()
            self.chairs.append((chairname, fscountry, "president"))
            # special case: duplicated president entry on S-PV-3370
            if fscountry in self.seccouncilmembers:
                assert len(self.seccouncilmembers) == 1
                assert fscountry == "New Zealand"
                assert self.undocname == "S-PV-3370"
                assert len(self.chairs) == 2
                del self.chairs[0]
                del self.seccouncilmembers[0]
            self.seccouncilmembers.append(fscountry)
            continue
        # member seat line: "Country . . . Mr. X" (or "absent")
        mcountryseat = re.match(
            "(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$",
            jtxlines[ih])
        if mcountryseat:
            if mcountryseat.group(1):
                # "<i>Members:</i>" prefix: must come right after the chair(s)
                if len(self.chairs) not in [1, 2]:  # in case of second president
                    if IsNotQuiet():
                        print self.chairs, "chchchch"
                    raise unexception(
                        "chairs not thereB",
                        paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            else:
                if len(self.chairs) == 0:
                    if not self.date:  # prob a closed meeting
                        break
                    if IsNotQuiet():
                        print ih, jtxlines[ih]
                    raise unexception(
                        "seat without chair",
                        paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            lfscountry = re.sub("\s+", " ", mcountryseat.group(2))
            fscountry = FixNationName(lfscountry, self.date)
            if not fscountry:
                if IsNotQuiet():
                    print "--%s--" % mcountryseat.group(2)
                raise unexception(
                    "unrecognized nationB",
                    paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            chairname = re.sub("\s\s+", " ", mcountryseat.group(3)).strip()
            self.chairs.append((chairname, fscountry, "member"))
            if fscountry not in self.seccouncilmembers:
                self.seccouncilmembers.append(fscountry)
            else:
                if IsNotQuiet():
                    print "Repeat-country on council", fscountry
        else:
            # a dotted line that matched neither pattern means a lost country
            if re.search(" \. \. \. \. \. \. ", jtxlines[ih]):
                if IsNotQuiet():
                    print "--%s--" % jtxlines[ih]
                raise unexception(
                    "missing country",
                    paranumC(self.undocname, None, 0, -1, self.textcountnumber))
            if re.match("<b>Agenda\s*</b>$", jtxlines[ih]):
                ih += 1
                break
            if re.search("Agenda", jtxlines[ih]):
                print ih, jtxlines
                raise unexception(
                    "unextracted Agenda (should be <b>?)",
                    paranumC(self.undocname, None, 0, -1, self.textcountnumber))
        ih += 1
    # could be a closed meeting
    if not self.date:
        alltext = " ".join(jtxlines)
        if re.search(
                "OFFICIAL COMMUNIQU..*?Held in private (?:in the Security Council Chamber )?at Headquarters(?i)",
                alltext):
            return False
        return True
    # collect the agenda lines, stopping at the document-footer patterns
    while ih < len(jtxlines):
        if re.match("\d\d-\d\d", jtxlines[ih]):
            break
        if re.match("\d\d.?\d\d\d\d\d \(E\)", jtxlines[ih]):
            break
        if re.match(
                "This record contains the text of speeches delivered in English",
                jtxlines[ih]):
            break
        #print "agagag", jtxlines[ih]
        assert not re.search("text of speeches|verbatim(?i)", jtxlines[ih])
        self.agenda.append(jtxlines[ih].strip())
        ih += 1
    #print "ccccc", self.chairs
    lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber)
    # sanity check: 15 council members (17 chairs allows two presidents)
    if len(self.chairs) not in (15, 17) or len(self.seccouncilmembers) != 15:
        if self.undocname == "S-PV-3446":
            return False
        if IsNotQuiet():
            print len(self.seccouncilmembers), len(
                self.chairs), "wrong number of members or chairs\n", self.chairs
            print self.seccouncilmembers
        raise unexception("wrongnumber on council", lparanum)
    self.agenda = " ".join(self.agenda)
    self.agenda = re.sub("</?b>", " ", self.agenda)
    self.agenda = re.sub("\s\s+", " ", self.agenda)
    self.agenda = MarkupLinks(
        CleanupTags(self.agenda, "council-agenda", lparanum), self.undocname, lparanum)
    return True
def __init__(self, xpage, lundocname, lpageno, textcountnumber):
    """Parse one <page> of pdftohtml XML output into header, footer and
    two body columns of TextLine objects.

    xpage           -- raw XML text of a single page
    lundocname      -- normalized UN document name (e.g. "S-PV-4143", "A-54-PV.97")
    lpageno         -- 1-based page number within the document
    textcountnumber -- running count of <text> elements seen so far in the document

    Raises unexception on layout problems (bad header, bad page number,
    text spilling past the column divider, negative indents).
    """
    self.pageno = lpageno
    self.undocname = lundocname
    self.textcountnumber = textcountnumber
    # Security Council docs are named S-PV-<session>; capture the session number
    self.bSecurityCouncil = re.match("S-PV.(\d+)", self.undocname)
    self.nSecurityCouncilSession = self.bSecurityCouncil and int(
        self.bSecurityCouncil.group(1)) or 0
    self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
    assert self.bSecurityCouncil or self.bGeneralAssembly
    # for right column, if not left justified, this adds a bit more to the right
    if self.bGeneralAssembly and int(
            re.match("A-(\d+)", lundocname).group(1)) <= 52:
        rightcolstartindentincrement = 1
    else:
        rightcolstartindentincrement = 0
    # set the column starts from some of the special cases we get
    # (pixel x-coordinates in the pdftohtml output; tuned per document era)
    leftcolstart = 90
    if self.bGeneralAssembly and int(
            re.match("A-(\d+)", lundocname).group(1)) <= 54:
        rightcolstart = 481
    else:
        rightcolstart = 468
    if lundocname in [
            "A-54-PV.100", "A-54-PV.96", "A-54-PV.98", "A-54-PV.99",
            "S-PV-4143", "S-PV-4143-Resu.1"
    ]:
        rightcolstart = 468
    elif lundocname in ["A-54-PV.97"]:
        rightcolstart = 486
    elif re.match("S-PV-335[0-8]", lundocname):
        rightcolstart = 468
    elif re.match("S-PV-334", lundocname):
        rightcolstart = 468
    elif self.nSecurityCouncilSession >= 4144:
        rightcolstart = 468
    #re.match("S-PV-414[4-9]", lundocname):
    #    rightcolstart = 468
    #elif re.match("S-PV-41[5-9]", lundocname):
    #    rightcolstart = 468
    #elif re.match("S-PV-4[2-9]", lundocname):
    #    rightcolstart = 468
    #elif re.match("S-PV-5", lundocname):
    #    rightcolstart = 468
    elif self.bSecurityCouncil:
        rightcolstart = 481
        rightcolstartindentincrement = 1

    # generate the list of lines, sorted by vertical position
    ftxlines = re.findall("<text.*?</text>", xpage)
    txlines = []
    for txline in ftxlines:
        txl = TextLine(txline, lundocname, lpageno, self.textcountnumber)
        self.textcountnumber += 1
        if txl.ltext:
            # a line at the same vertical position as a footer line is
            # also treated as footer
            if txlines and txlines[-1].bfootertype and txlines[
                    -1].top == txl.top:
                txl.bfootertype = True
            txlines.append(txl)
    txlines.sort(key=TextLineTopKey)

    # the half divider is at 459
    # try to separate out the header and footers
    # ih = index of first body line; ie = index just past last body line
    if self.pageno == 1 and self.bGeneralAssembly:
        ih = self.ExtractDotLineChairHead(txlines)
        #for Dtxl in txlines[-10:]:
        #    print Dtxl.top, Dtxl.left, Dtxl.ltext
        ie = len(txlines) - 1
        while txlines[ie].bfootertype:
            #print "FOOTER:", txlines[ie].ltext
            ie -= 1
        #print "**NON-FOOTER:", txlines[ie].ltext
        ie += 1
        # the whole first page gets parsed separately
        assert not self.bSecurityCouncil
    elif self.bSecurityCouncil and self.pageno == 1:
        # Security Council front page is fully consumed here; no column split
        if not self.ExtractSeccounFrontPage(txlines):
            self.bSecurityCouncil = "ClosedSession"
        return
    # special case where the agenda spills to a second page (don't forget the outer application of this if)
    elif self.bSecurityCouncil and lundocname in twopageagendas and self.pageno == 2:
        ih = 0
        self.agenda = []
        while ih < len(txlines):
            # 132..999 is the vertical band holding the agenda text — TODO confirm
            if 132 <= txlines[ih].top < 1000:
                self.agenda.append(txlines[ih].ltext)
            ih += 1
        self.agenda = " ".join(self.agenda)
        self.agenda = re.sub("</?b>", " ", self.agenda)
        self.agenda = re.sub("\s\s+", " ", self.agenda)
        lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber)
        self.agenda = MarkupLinks(
            CleanupTags(self.agenda, "council-agenda", lparanum),
            self.undocname, lparanum)
        return
    elif self.bGeneralAssembly:
        # GA continuation pages: header is either the doc symbol, or the
        # page number followed by the symbol, or the full 4-line banner
        if re.match("<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[0].ltext):
            ih = 1
        elif re.match("\d", txlines[0].ltext) and re.match(
                "<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[1].ltext):
            ih = 2
        else:
            #print txlines[0].ltext
            assert re.match("General Assembly",
                            txlines[0].ltext), txlines[0].ltext
            assert re.match("\d+(?:th|st|nd|rd) (?:plenary )?meeting",
                            txlines[1].ltext)
            assert re.match("\S+ [Ss]ession", txlines[2].ltext)
            assert re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) or (
                lundocname in ["A-50-PV.38", "A-50-PV.40"])
            ih = 4
        # footer: optional job number (NN-NNNNN) then the printed page number
        ie = len(txlines) - 1
        if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
            ie -= 1
        pagenumtext = re.sub("<..?>", "", txlines[ie].ltext).strip()
        if re.match("\d\d\-\d\d\d\d\d", txlines[ie - 1].ltext):
            ie -= 1
        if not re.match("\d+$", pagenumtext):
            if IsNotQuiet():
                print "jjjj", pagenumtext, txlines[ie].ltext
            raise unexception(
                "pagenum error not a number",
                paranumC(self.undocname, None, 0, -1,
                         txlines[ie].textcountnumber))
        # printed page number must agree with our page counter
        if int(pagenumtext) != self.pageno:
            if IsNotQuiet():
                print pagenumtext, self.pageno
            raise unexception(
                "pagenum serror of speaker-intro",
                paranumC(self.undocname, None, 0, -1,
                         txlines[ie].textcountnumber))
    elif self.bSecurityCouncil:
        #if len(txlines) < 4:
        #    raise unexception("intro too short", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber))
        # SC continuation pages: either a 4-line banner (old style) or a
        # single bold S/PV.nnnn symbol (sessions >= 4143)
        bl0 = len(txlines) > 4 and re.match("Security Council",
                                            txlines[0].ltext)
        bl1 = len(txlines) > 4 and re.match(
            "\d+(?:th|st|nd|rd)? (?:\(Resumption(?: \d)?\) )?(?:meeting)?",
            txlines[1].ltext)
        bl2 = len(txlines) > 4 and re.match("(\w+-\w+|\w+) [Yy]ear",
                                            txlines[2].ltext)
        bl3 = len(txlines) > 4 and re.match("\d+ \w+ \d\d\d\d",
                                            txlines[3].ltext)
        bl4 = re.match(
            "<b>S/PV.\d+\s*(?:\(Resumption [\d|I]\)|\(Part [I]+\))?\s*</b>",
            txlines[0].ltext)
        bl4r = (self.undocname[5:] >= "4143")  # string compare on session number
        if bl4 and bl4r:
            ih = 1
        elif bl0 and bl1 and bl2 and bl3:
            ih = 4
        else:
            if IsNotQuiet():
                print "\nFirst four lines on page:", self.pageno, bl4, bl4r
                print bl0, txlines[0].ltext
                print bl1, txlines[1].ltext
                print bl2, txlines[2].ltext
                print bl3, txlines[3].ltext
                print bl4, bl4r
            raise unexception(
                "bad page header",
                paranumC(self.undocname, None, 0, -1,
                         txlines[0].textcountnumber))
        ie = len(txlines) - 1
        if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
            ie -= 1
        pagenumtext = txlines[ie].ltext
        mpagenumtext = re.match("(?:<b>)?(\d+)\s*(?:</b>)?$", pagenumtext)
        if not mpagenumtext:
            if IsNotQuiet():
                print "jkjk", pagenumtext
            raise unexception(
                "pagenum error not a number",
                paranumC(self.undocname, None, 0, -1,
                         txlines[ie].textcountnumber))
        # some documents are known to have misprinted page numbers
        pgoffset = int(mpagenumtext.group(1)) - self.pageno
        if pgoffset != 0 and self.undocname not in misnumberedpages:
            if IsNotQuiet():
                print "pagenum-offset not in list", self.undocname, mpagenumtext.group(
                    1), self.pageno
            raise unexception(
                "page pagenum error of speaker-intro",
                paranumC(self.undocname, None, 0, -1,
                         txlines[ie].textcountnumber))
        if re.match("\d\d-\d\d\d\d\d$", txlines[ie - 1].ltext):
            ie -= 1
    else:
        assert False

    # separate out the header and footers
    self.txlheader = txlines[:ih]
    self.txlfooter = txlines[ie:]

    # separate the body into the two columns (divider at x=459)
    self.txlcol1 = []
    self.txlcol2 = []
    self.minindentleft = 9999
    self.minindentright = 9999
    for txl in txlines[ih:ie]:
        if txl.left < 459:
            #print txl.bfootertype, txl.left, txl.width, txl.top, txl.ltext  # zzzz
            # there's a bit of spilling out where the region is larger than it should be for the words as in A-56-PV.64
            if not (txl.left + txl.width <= 459):
                if txl.left + txl.width > 501:
                    if IsNotQuiet():
                        print txl.left, txl.width, txl.left + txl.width
                        print txl.ltext
                        print "might have page no. 1 on first page (or add to twopageagendas)"
                    raise unexception(
                        "right-hand extension excessive",
                        paranumC(txl.undocname, None, 0, -1,
                                 txl.textcountnumber))
                # a far-indented spilling line must line up with an earlier
                # line of the same cluster at the same vertical position
                if not (txl.left <= 165):
                    bc = -1
                    while True:
                        assert self.txlcol1[-1].txls[
                            bc].top == txl.top  # in-line but shorter
                        if (self.txlcol1[-1].txls[bc].left <= 165):
                            break
                        bc -= 1
            txl.indent = txl.left - leftcolstart
            if txl.indent < 0:
                if IsNotQuiet():
                    print txl.indent, txl.ltext
                raise unexception(
                    "negative indentation",
                    paranumC(txl.undocname, None, 0, -1,
                             txl.textcountnumber))
            self.minindentleft = min(txl.indent, self.minindentleft)
            txl.brightcol = False
            AppendToCluster(self.txlcol1, txl)
        else:
            txl.indent = txl.left - rightcolstart
            # non-zero indents in the right column get the per-era correction
            if txl.indent != 0:
                txl.indent += rightcolstartindentincrement
            if txl.indent < 0:
                if IsNotQuiet():
                    print txl.indent, txl.left, rightcolstart
                    print txl.ltext
                raise unexception(
                    "negative indent on righthand column",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            self.minindentright = min(txl.indent, self.minindentright)
            txl.brightcol = True
            AppendToCluster(self.txlcol2, txl)
def ScrapeSCContentsPage(year, contentsurl):
    """Scrape one year's Security Council contents page.

    Classifies every daccess link on the page (resolutions, verbatim
    records, corrigenda, presidential statements, failed-resolution
    documents), verifies the numbering sequences are complete apart
    from known gaps, and calls ScrapePDF on each item.

    year        -- the year the contents page covers (int)
    contentsurl -- URL of the index page
    """
    if IsNotQuiet():
        print "URL index:", contentsurl
    fin = urllib2.urlopen(contentsurl)
    scindex = fin.read()
    fin.close()
    reslist = []
    pvlist = []
    prstlist = []
    scdoclist = []
    pvcorrlist = []
    # this gets everything except the press releases in the middle column
    scindexlist = re.findall(
        '<a.[^>]*?href=\s*"(http://daccess[^"]*)"[^>]*>\s*(?:<font size="2">)?(.*?)(?:<br>\s*)?</a>(?is)',
        scindex)
    for sci in scindexlist:
        #print sci[1]
        # communique for an embargoed verbatim recording
        if re.match("Communiqu.", sci[1]):
            pvlm12 = re.sub("&", "&", pvlist[-1][2])
            if sci[0] != pvlm12 and sci[0] != pvlist[-1][2] and not re.search(
                    "PV.5794|PV.5906", sci[0]):
                print "Communique doesn't have same link as it should:\n%s\n%s" % (
                    sci[0], pvlist[-1][2])
            # same link
            continue
        # security council resolutions (stored negated so sort() is descending)
        scres = re.match("S/RES/(\d+)\s*\((\d+)\)\s*$", sci[1])
        if scres:
            reslist.append((-int(scres.group(1)), scres, sci[0]))
            continue
        # verbatim recordings
        scpv = re.match(
            "S/PV\.(\d+)(?:\s*<br>)?(?:\s*\((Resumption|Part)\s*([\dI]*)\))?\s*(?:\(closed\))?$",
            sci[1])
        if scpv:
            pvlist.append((-int(scpv.group(1) or "1"), scpv, sci[0]))
            #print scpv.group(0)
            continue
        # corrigenda, which happens to the verbatim transcripts
        sccorr = re.match("Corr\.(\d+)\s*", sci[1])
        if sccorr:
            # a corrigendum's URL should match the preceding PV's URL
            urlwithoutcorr = re.sub("/Corr\.\d+(?i)", "", sci[0])
            if pvlist[-1][2] != urlwithoutcorr and IsNotQuiet():
                print pvlist[-1][2]
                print urlwithoutcorr
                if year == 1998 and re.search(
                        "PV\.3896", pvlist[-1][2]) and re.search(
                            "PV\.3986", urlwithoutcorr):
                    print " --- known typo"
                elif year == 1995 and re.search(
                        "PV\.3528", pvlist[-1][2]) and re.search(
                            "PV\.3611", urlwithoutcorr):
                    print " --- known typo"
                elif year == 2008 and re.search(
                        "PV\.5916", pvlist[-1][2]) and re.search(
                            "PV\.5916", urlwithoutcorr):
                    print " --- known unconsolidated typo"
                else:
                    print year, pvlist[-1], sci
                    assert False
            pvcorrlist.append((pvlist[-1][1], sccorr.group(1), sci[0]))
            continue
        # presidential statements
        scprst = re.match("S/PRST/(\d+)/(\d+)\s*$", sci[1])
        if scprst:
            assert int(scprst.group(1)) == year
            prstlist.append((-int(scprst.group(2)), scprst, sci[0]))
            continue
        # security council documents (usually a failed resolution)
        scdoc = re.match("\(?S/(\d+)/(\d+)\)?\s*$", sci[1])
        if scdoc:
            assert int(scdoc.group(1)) == year
            scdoclist.append((-int(scdoc.group(2)), scdoc, sci[0]))
            continue
        # known typo link
        if re.match("<a>", sci[1]):
            assert sci[0] == pvlist[-1][2]  # same link
            continue
        if IsNotQuiet():
            print "Unrecognized link type", "$$%s$$" % sci[1]
        assert False

    # sort and scrape all the presidential statements
    prstlist.sort()
    for i in range(1, len(prstlist)):
        # check consecutive numbering, tolerating the known gaps below
        if -prstlist[i - 1][0] - 1 != -prstlist[i][0] and IsNotQuiet():
            print "presidential statement missing between ", -prstlist[
                i - 1][0], "and", -prstlist[i][0]
            if (year, -prstlist[i - 1][0], -prstlist[i][0]) in [
                    (2000, 28, 26), (1996, 11, 9), (1996, 4, 2),
                    (1995, 57, 55), (1995, 37, 35), (1995, 15, 13),
                    (1995, 4, 2), (1994, 77, 75), (1994, 50, 48),
                    (1994, 42, 39), (1994, 25, 23), (1994, 19, 17),
                    (1994, 4, 2)
            ]:
                print " -- known missing statement"
            else:
                assert False
    for (i, prst, prsturl) in prstlist:
        ScrapePDF("S-PRST-%s-%s" % (prst.group(1), prst.group(2)),
                  plenaryurl=contentsurl,
                  purl=prsturl)

    # now sort and scrape all the verbatims
    pvlist.sort()
    for i in range(1, len(pvlist)):
        #print pvlist[i - 1][1].group(2), pvlist[i - 1][1].group(3)
        if -pvlist[i - 1][0] == -pvlist[i][0]:
            # same meeting number twice: must be consecutive resumptions
            if pvlist[i - 1][1].group(2) == "Resumption":
                resum = int(pvlist[i][1].group(3) or "0")
                if not pvlist[i - 1][1].group(2):
                    if IsNotQuiet():
                        print "rrr", pvlist[i - 1][1].group(0)
                # there must be a resumption number
                if not pvlist[i - 1][1].group(3):
                    resumP = 1
                elif pvlist[i - 1][1].group(3) == "I":
                    resumP = 1
                else:
                    resumP = int(pvlist[i - 1][1].group(3))
                assert resumP == resum + 1
            else:
                if IsNotQuiet():
                    print "slslsl", pvlist[i - 1][1].group(2), pvlist[i][
                        1].group(2)
        elif -pvlist[i - 1][0] - 1 != -pvlist[i][0]:
            if IsNotQuiet():
                print "verbatim report missing between ", -pvlist[
                    i - 1][0], "and", -pvlist[i][0]
            assert False
    for (i, scpv, scpvurl) in pvlist:
        resumppart = ""
        if scpv.group(2) == "Resumption":
            if scpv.group(3) == "I":
                resnum = 1
            else:
                resnum = int(scpv.group(3))
            resumppart = "-Resu.%d" % resnum
        elif scpv.group(2) == "Part":
            if scpv.group(3) == "I":
                pn = "1"
            elif scpv.group(3) == "II":
                pn = "2"
            else:
                # NOTE(review): if the part is neither I nor II, pn is left
                # undefined and the next line raises NameError — confirm
                # whether Part III ever occurs in practice
                if IsNotQuiet():
                    print "asspv", scpv.group(0), scpv.group(3)
            resumppart = "-Part.%s" % pn
        ScrapePDF("S-PV-%s%s" % (scpv.group(1), resumppart),
                  plenaryurl=contentsurl,
                  purl=scpvurl)

    # do corrigendas
    for (scpv, pvcorr, pvcorrurl) in pvcorrlist:
        ScrapePDF("S-PV-%s%s-Corr.%s" %
                  (scpv.group(1),
                   (scpv.group(2) and ("-Resu.%s" % scpv.group(3)) or ""),
                   pvcorr),
                  plenaryurl=contentsurl,
                  purl=pvcorrurl)

    # now sort and scrape all the resolutions
    reslist.sort()
    for i in range(1, len(reslist)):
        if -reslist[i - 1][0] - 1 != -reslist[i][0]:
            if IsNotQuiet():
                print "resolution missing between ", -reslist[
                    i - 1][0], "and", -reslist[i][0]
            assert False
    for (i, scres, scresurl) in reslist:
        ScrapePDF("S-RES-%s(%s)" % (scres.group(1), scres.group(2)),
                  plenaryurl=contentsurl,
                  purl=scresurl)
def __init__(self, xfil, undocname):
    """Build the whole-document line-cluster structure from the pdftohtml XML.

    Splits the XML into pages (StripPageTags), parses each page
    (TextPage), concatenates the page columns into one flow of clusters
    (self.tlcall), assigns paragraph numbers, and merges each cluster's
    lines into a single cleaned-up paratext string.

    On a bitmap-only or closed-session document, returns early with
    self.tlcall left as None.
    """
    self.sdate = None
    self.chairs = None
    self.agenda = None
    self.tlcall = None
    self.seccouncilmembers = None
    self.bSecurityCouncil = re.match("S-PV.\d+", undocname)
    self.bGeneralAssembly = re.match("A-\d+-PV", undocname)
    xpages = StripPageTags(xfil, undocname)
    if not xpages:
        return  # bitmap type encountered
    txpages = []
    self.tlcall = []
    for i in range(len(xpages)):
        # carry the running text-element count across pages
        txpage = TextPage(xpages[i], undocname, i + 1,
                          (txpages or 0) and txpages[-1].textcountnumber)
        if i == 0 and txpage.bSecurityCouncil == "ClosedSession":
            if IsNotQuiet():
                print " -- closedsession"
            self.tlcall = None
            return  # closed session encountered
        txpages.append(txpage)
        # SC front page carries no body columns
        if txpage.bSecurityCouncil and i == 0:
            continue
        # special cases of agenda overflowing into two pages
        if txpage.bSecurityCouncil and i == 1 and undocname in twopageagendas:
            txpages[0].agenda = "%s %s" % (
                txpages[0].agenda, txpage.agenda
            )  # ram it all into one paragraph (who cares)
            continue
        bmissingcolumns = undocname in ["A-61-PV.106", "A-52-PV.39"]
        if txpage.txlcol1:
            AppendCluster(self.tlcall, txpage.txlcol1[0], "newpage")
            for tlc in txpage.txlcol1[1:]:
                AppendCluster(self.tlcall, tlc, "gapcluster")
        elif not bmissingcolumns:
            #assert i == len(xpages) - 1  # only last page can have missing columns (sometimes it's the first)
            print "page", i, "of", len(xpages)
            #print txpages[-1].textcountnumber
            raise unexception(
                "missing column not on last page",
                paranumC(undocname, None, 0, -1,
                         txpages[-1].textcountnumber))
        # have had a case where the first column was the blank one
        if txpage.txlcol2:
            AppendCluster(self.tlcall, txpage.txlcol2[0], "newcolumn")
            for tlc in txpage.txlcol2[1:]:
                AppendCluster(self.tlcall, tlc, "gapcluster")
        elif not bmissingcolumns:
            assert i == len(xpages) - 1, "%d != %d" % (i, len(xpages) - 1)

    # assign ids to the clusters
    # paragraph numbering restarts at 1 on each new page
    self.sdate = txpages[0].date
    paranumlast = paranumC(undocname, self.sdate, 0, -1, 0)
    for tlc in self.tlcall:
        if tlc.txls[0].pageno == paranumlast.pageno:
            paranumlast = paranumC(undocname, self.sdate,
                                   paranumlast.pageno,
                                   paranumlast.paragraphno + 1,
                                   tlc.txls[0].textcountnumber)
        else:
            paranumlast = paranumC(undocname, self.sdate,
                                   tlc.txls[0].pageno, 1,
                                   tlc.txls[0].textcountnumber)
        tlc.paranum = paranumlast

    # merge the lines together and remove double bold/italics that happen across lines
    for tlc in self.tlcall:
        jparatext = []
        # don't insert spaces where there is a hyphen
        for txl in tlc.txls:
            if jparatext and not (re.search("\w[-/]$", jparatext[-1])
                                  and re.match("\w", txl.ltext)):
                jparatext.append(" ")
            jparatext.append(txl.ltext)
        tlc.paratext = "".join(jparatext)
        # collapse markup broken across the former line boundaries
        tlc.paratext = re.sub("-</i> <i>", "-", tlc.paratext)
        tlc.paratext = re.sub("-</b> <b>", "-", tlc.paratext)
        tlc.paratext = re.sub("</b>\s*\.\s*<b>", ". ", tlc.paratext)
        tlc.paratext = re.sub("Secretary- General", "Secretary-General",
                              tlc.paratext)
        tlc.paratext = re.sub(
            "\s*(?:</i>\s*<i>|</b>\s*<b>|<b>\s*</b>|<i>\s*</i>|<b>\s*<i>\s*</b>\s*</i>)\s*",
            " ", tlc.paratext)
        tlc.paratext = tlc.paratext.strip()
        # normalize the Co-Chairperson speaker intro markup
        tlc.paratext = re.sub(
            "^<b>(The(?: Acting)? Co-Chairperson) \(([^\)]*)\)\s*(?:</b>\s*:|:\s*</b>)",
            "<b>\\1</b> (\\2):", tlc.paratext)
        tlc.lastindent = tlc.indents[-1][0]
    self.agenda = txpages[0].agenda
    self.chairs = txpages[0].chairs
    if self.bSecurityCouncil:
        self.seccouncilmembers = txpages[0].seccouncilmembers
def GetFromNet(undocname, purl, plenaryurl): req = urllib2.Request(purl) req.add_header('Referer', plenaryurl) fin = urllib2.urlopen(req) plenrefererforward = fin.read() fin.close() mfore = re.search('URL=([^"]*)', plenrefererforward) if not mfore: if undocname == "A-55-PV.26": # claims to be embargoed if IsNotQuiet(): print "broken", pdfname return False if re.search("There is no document", plenrefererforward): #print "no-document" return False if re.search("This document is under EMBARGO", plenrefererforward): if IsNotQuiet(): print "*** EMBARGOED ***" return False if re.search("The distribution of the document is to hight", plenrefererforward): if IsNotQuiet(): print "*** TO HIGHT ***" return False if not IsNotQuiet(): # bail out without error return False print "plplplpl", plenrefererforward assert False turl = urlparse.urljoin(purl, mfore.group(1)) # pull in the login url, containing another forward, and a page which gives the cookies fin = urllib2.urlopen(turl) plenarycookielink = fin.read() fin.close() #<META HTTP-EQUIV="refresh" CONTENT="1; URL=http://daccessdds.un.org/doc/UNDOC/GEN/N02/596/08/PDF/N0259608.pdf?OpenElement"> #<frame name="footer" scrolling="no" noresize target="main" src="http://daccessdds.un.org/prod/ods_mother.nsf?Login&Username=freeods2&Password=1234" marginwidth="0" marginheight="0"> # extract pdf link mpdf = re.search('URL=([^"]*)', plenarycookielink) if not mpdf: if not IsNotQuiet(): # bail out without error return False print "pcpcpcpc", plenarycookielink plenarypdfurl = urlparse.urljoin(turl, mpdf.group(1)) # extract cookie link mcook = re.search('src="(http://daccessdds.un.org/[^"]*)', plenarycookielink) if not mcook: if not IsNotQuiet(): # bail out without error return False print "plplsplspl", plenarycookielink plenarycookurl = urlparse.urljoin(turl, mcook.group(1)) # take the cookies from the cookie link cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) fin = opener.open(plenarycookurl) 
fin.close() if IsNotQuiet(): print plenarypdfurl[-30:] # put them into the pdf link fin = opener.open(plenarypdfurl) plenarypdf = fin.read() fin.close() return plenarypdf
def __init__(self, tlcall, i, lundocname, lsdate, seccouncilmembers):
    """Parse a recorded-vote block starting at cluster index i of tlcall.

    Skips the "A recorded vote has been requested" / "A vote was taken"
    preamble lines, extracts the In favour / Against / Abstaining nation
    lists, computes the absent list, and builds self.votelist as an HTML
    span list.  self.i is advanced past the consumed clusters.

    seccouncilmembers -- the 15 council members (ignored and nulled for
                         General Assembly documents)
    """
    self.tlcall = tlcall
    self.i = i
    self.sdate = lsdate
    self.undocname = lundocname
    self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
    self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
    assert self.bGeneralAssembly or self.bSecurityCouncil
    if not self.bSecurityCouncil:
        seccouncilmembers = None
    self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum
    # strip italics before matching the preamble text
    vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
    if self.bGeneralAssembly and re.match(
            "A recorded vote has been requested(?: for this item| on (?:the|this) motion|\. We shall now begin the voting process)?\.?$",
            vtext):
        self.i += 1
        vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
    if self.bGeneralAssembly and re.match(
            "A recorded vote was taken\s*\.?$", vtext):
        self.i += 1
    if self.bSecurityCouncil and re.match(
            "A vote was taken(?: by (?:a )?show of hands)?.$", vtext):
        self.i += 1
    # a preamble line must have been consumed, except in the listed documents
    if not (self.i != i or self.undocname in
            ["A-55-PV.86", "A-50-PV.90", "A-49-PV.90"]):
        print "--%s--" % tlcall[self.i - 1].paratext
        if not re.match("<i>", tlcall[self.i - 1].paratext):
            print " --[should this line be italic?]"
        print tlcall[self.i].paratext
        raise unexception("requested vote not followed through",
                          tlcall[self.i].paranum)
    self.vlfavour = self.DetectVote("<i>In favour:?\s*</i>:?")
    self.vlagainst = self.DetectVote("(?:<i>)?Against:?\s*(?:</i>)?:?")
    self.vlabstain = self.DetectVote("(?:<i>)?Abstaining:?(?:</i>)?:?")
    # gnv maps nation -> vote; vlabsent is everyone eligible who didn't vote
    gnv, self.vlabsent = GenerateNationsVoteList(self.vlfavour,
                                                 self.vlagainst,
                                                 self.vlabstain, self.sdate,
                                                 self.paranum,
                                                 seccouncilmembers)
    self.votecount = "favour=%d against=%d abstain=%d absent=%d" % (
        len(self.vlfavour), len(self.vlagainst), len(self.vlabstain),
        len(self.vlabsent))
    if IsNotQuiet():
        print " ", self.votecount
    if self.bGeneralAssembly:
        self.DetectAdoption()
        self.DetectSubsequentVoteChange(gnv)
    if self.bSecurityCouncil:
        self.motiontext = ""
        self.DetectDidnotparticipate(gnv, self.vlabsent)
    #res = [ '\t\t<div style="border:1px solid black; margin-left:2em"><b>VOTE ', votecount, "</b><br>\n", "\t\t<i>", self.motiontext, "</i>\n" ]
    #res.append('\t\t<div style="font-size:6">')
    lvotelist = []
    for nation, vote in sorted(gnv.items()):
        lvotelist.append('<span class="%s">%s</span>' % (vote, nation))
    self.votelist = ", ".join(lvotelist)
    #res.append("</div></div>\n")
    #self.parafout = "".join(res)
    self.typ = "vote"
def process_file(pfnameunindexed, xapian_db):
    """Index one generated HTML transcript file into the xapian database.

    Walks the file's <div> sections in order, creating one xapian
    document per div.  heading/subheading divs are held back and only
    committed when the next subheading or the end-document div arrives,
    so their data records can include the full section length and the
    accumulated heading terms.  Finally renames *.unindexed.html to
    *.html to mark the file as indexed.
    """
    mdocid = re.match(r".*?(html[\\/])([\-\d\w\.]+?)(\.unindexed)?(\.html)$",
                      pfnameunindexed)
    assert mdocid, "unable to match: %s" % pfnameunindexed
    document_id = mdocid.group(2)
    fin = open(pfnameunindexed)
    doccontent = fin.read()
    fin.close()
    mdocument_date = re.search(
        '<span class="date">(\d\d\d\d-\d\d-\d\d)</span>', doccontent)
    assert mdocument_date, "not found date in file %s" % pfnameunindexed
    document_date = mdocument_date.group(1)
    if IsNotQuiet():
        print "indexing %s %s" % (document_id, document_date)
    while delete_all_for_doc(document_id, xapian_db):
        pass  # keep calling delete until all clear

    # Loop through each speech, and batch up the headings so they can be updated with the correct info
    xapian_doc_heading = None
    sdiv_headingdata = "NOHEADINGSET"  # kills off the assertion that happens later. we can get a "meeting began" before the first heading
    xapian_doc_subheading = None
    sdiv_subheadingdata = None
    headingtermsubheading = set()
    headingtermheading = set()
    lastend = 0
    tdocument_id, gasssess = thinned_docid(document_id)
    # document-level terms shared by every xapian doc from this file
    docterms = set()
    docterms.add("D%s" % document_id)
    docterms.add("E%s" % document_date[:4])  # year
    docterms.add("E%s" % document_date[:7])  # year+month
    docterms.add("E%s" % document_date)  # full date
    #if document_date > "2001-09-11":
    #    docterms.add("Epost911")  # "9/11 changed everything"
    if gasssess:
        docterms.add("Zga")
        docterms.add("Zga%s" % gasssess)
    else:
        docterms.add("Zsc")
    mdivs = re.finditer(
        '^<div class="([^"]*)" id="([^"]*)"(?: agendanum="([^"]*)")?[^>]*>(.*?)^</div>',
        doccontent, re.S + re.M)
    for mdiv in mdivs:
        # used to dereference the string as it is in the file
        div_class = mdiv.group(1)
        # (document-id, byte offset, byte length, div id)
        div_data = (document_id, mdiv.start(), mdiv.end() - mdiv.start(),
                    mdiv.group(2))
        xapian_doc = MakeBaseXapianDoc(mdiv, tdocument_id, document_date,
                                       headingtermsubheading)
        for dterm in docterms:
            xapian_doc.add_term(dterm)
        if div_class == "heading":
            assert not xapian_doc_heading, "Only one heading per document"
            xapian_doc_heading = xapian_doc
            sdiv_headingdata = div_data
        # the data put into a xapian object is: speech | document-id | offset | length | heading-id containing this speech | length of full section if this is a heading
        elif div_class in ["subheading", "end-document"]:
            assert xapian_doc_heading
            # flush the pending subheading now that its extent is known
            if xapian_doc_subheading:
                for hterm in headingtermsubheading:
                    xapian_doc_subheading.add_term(hterm)
                dsubheadingdata = "%s|%s|%d|%d|%s|%d" % (
                    sdiv_subheadingdata[3], sdiv_subheadingdata[0],
                    sdiv_subheadingdata[1], sdiv_subheadingdata[2],
                    sdiv_headingdata[3], lastend - sdiv_subheadingdata[1])
                xapian_doc_subheading.set_data(dsubheadingdata)
                xapian_db.add_document(xapian_doc_subheading)
                headingtermheading.update(headingtermsubheading)
            if div_class == "subheading":
                headingtermsubheading.clear()
                xapian_doc_subheading = xapian_doc
                sdiv_subheadingdata = div_data
            else:
                headingtermsubheading = None
                xapian_doc_subheading = None
                sdiv_subheadingdata = None
            if div_class == "end-document":
                # flush the pending heading with the whole-section length
                for hterm in headingtermheading:
                    xapian_doc_heading.add_term(hterm)
                dheadingdata = "%s|%s|%d|%d|%s|%d" % (
                    sdiv_headingdata[3], sdiv_headingdata[0],
                    sdiv_headingdata[1], sdiv_headingdata[2], "",
                    lastend - sdiv_headingdata[1])
                xapian_doc_heading.set_data(dheadingdata)
                xapian_db.add_document(xapian_doc_heading)
                xapian_doc_heading = None
        else:
            assert div_class in [
                "assembly-chairs", "council-agenda", "council-attendees",
                "spoken", "italicline", "italicline-tookchair",
                "italicline-spokein", "recvote", "boldline"
            ], "unknown divclass:%s" % div_class
            assert sdiv_subheadingdata or sdiv_headingdata
            ddata = "%s|%s|%d|%d|%s|" % (div_data[3], div_data[0],
                                         div_data[1], div_data[2],
                                         (sdiv_subheadingdata
                                          or sdiv_headingdata)[3])
            xapian_doc.set_data(ddata)
            xapian_db.add_document(xapian_doc)
        lastend = mdiv.end()

    # the end-document tag helps us close these headings off
    assert not xapian_doc_subheading and not xapian_doc_heading
    # Note that the document has been indexed
    xapian_db.flush()
    if mdocid.group(3):  # unindexed
        pfnameindexed = re.sub(r"\.unindexed", "", pfnameunindexed)
        if os.path.exists(pfnameindexed):
            os.unlink(pfnameindexed)
        #print pfnameunindexed, pfnameindexed
        os.rename(pfnameunindexed, pfnameindexed)
def __init__(self, tlcall, i, lundocname, lsdate, speakerbeforetookchair,
             prevagendanum):
    """Parse one speech/heading section starting at cluster index i.

    Uses DetectSpeaker to classify the first cluster as an italicline
    variant, a boldline (agenda heading), or a spoken contribution, then
    accumulates the following clusters into self.paragraphs until the
    section ends.  self.i is advanced past the consumed clusters;
    self.agendanum is set for GA agenda headings.
    """
    self.tlcall = tlcall
    self.i = i
    self.sdate = lsdate
    self.undocname = lundocname
    self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
    if not self.bSecurityCouncil:
        # GA session number, used by the agenda detector
        self.genasssess = re.match("A-(\d+)", self.undocname).group(1)
    self.agendanum = ""
    self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum
    # paranum = ( undocname, sdate, tlc.txls[0].pageno, paranumber )
    #self.gid = self.paranum.MakeGid()
    tlc = self.tlcall[self.i]
    #print "\npppp", tlc.indents, tlc.paratext, tlc.txls
    ptext, self.typ, self.speaker = DetectSpeaker(tlc.paratext, tlc.indents,
                                                  self.paranum,
                                                  speakerbeforetookchair)
    ptext = MarkupLinks(CleanupTags(ptext, self.typ, self.paranum),
                        self.undocname, self.paranum)
    self.i += 1
    if self.typ in [
            "italicline", "italicline-tookchair", "italicline-spokein"
    ]:
        # single-line stage direction; nothing more to accumulate
        self.paragraphs = [("italicline", ptext)]
        return

    # series of boldlines
    if self.typ == "boldline":
        self.agendanum = ""
        blinepara = tlc.lastindent and "blockquote" or "p"
        # detect the agenda
        if not self.bSecurityCouncil:
            self.agendanum = DetectAgendaForm(ptext, self.genasssess,
                                              prevagendanum, self.paranum)
            #print "aaaaa ", self.agendanum
            if not self.agendanum:
                if IsNotQuiet():
                    print "if no agenda, add to AgendaTypeMap"
                raise unexception(" uncategorized agenda title",
                                  self.paranum)
        self.paragraphs = [(blinepara, ptext)]
        # consume the run of consecutive boldline clusters
        while self.i < len(self.tlcall):
            tlc = self.tlcall[self.i]
            if not re.match(reboldline, tlc.paratext):
                break
            ptext = MarkupLinks(
                CleanupTags(tlc.paratext, self.typ, self.paranum),
                self.undocname, self.paranum)
            # a second agenda number gets found
            if not self.bSecurityCouncil and re.match(
                    "Agenda(?: item)? \d+(?i)", ptext):
                agendanum2 = DetectAgendaForm(ptext, self.genasssess,
                                              prevagendanum, self.paranum)
                print "agendanum from second line", agendanum2
                assert agendanum2, ptext  # must detect it
                if re.search("misc|show|address", self.agendanum):
                    self.agendanum = agendanum2  # a woolly agenda can be over-ridden
                elif self.undocname == "A-62-PV.74":
                    self.agendanum = "%s,%s" % (self.agendanum, agendanum2)
                else:
                    print self.agendanum
                    print ptext
                    raise unexception(" unknown extra agendanum case",
                                      self.paranum)
                print "aaaa2aa ", self.agendanum
            self.paragraphs.append((tlc.lastindent and "boldline-indent"
                                    or "boldline-p", ptext))
            self.i += 1
        return

    # actual spoken section
    assert self.typ == "spoken"
    assert tlc.lastindent == 0 or len(
        tlc.indents) == 1  # doesn't happen in first paragraph of speech
    self.paragraphs = [("p", ptext)]
    # accumulate paragraphs until the next speaker/heading is detected
    while self.i < len(self.tlcall):
        tlc = self.tlcall[self.i]
        if self.DetectEndSpeech(tlc.paratext, tlc.lastindent, self.sdate):
            break
        ptext = MarkupLinks(
            CleanupTags(tlc.paratext, self.typ, self.paranum),
            self.undocname, self.paranum)
        # indented multi-line paragraphs are rendered as quotations
        bIndent = (len(tlc.indents) == 1) and (tlc.indents[0][0] != 0) and (
            tlc.indents[0][1] > 1)
        self.paragraphs.append(((bIndent and "blockquote" or "p"), ptext))
        self.i += 1
def DetectSpeaker(ptext, indents, paranum, speakerbeforetookchair):
    """Classify one paragraph of a UN verbatim record.

    Returns (ptext, typ, currentspeaker) where typ is one of "spoken",
    "boldline", "italicline", "italicline-tookchair" or
    "italicline-spokein".  For spoken text currentspeaker is a
    (name, nation, language, bIsNotnation) tuple (or the passed-in
    speakerbeforetookchair continuation); otherwise it is None.
    Raises unexception for any paragraph it cannot classify.
    """
    #print ptext, "\n\n\n"
    # vote tallies should already have been folded into a voteblock upstream
    if re.match("<i>(?:In favour|Against|Abstaining)", ptext):
        # should be part of a voteblock
        print ptext
        #print tlcall[i - 1].paratext
        assert False

    # a few stock procedural sentences arrive without italics; wrap them so
    # they fall into the italicline branch below
    if re.match(
            "(?:The agenda was adopted\.|A vote was taken by show of hands\.|There being no objection, it is so decided\.)$",
            ptext):
        if IsNotQuiet():
            print "italicizingline", len(indents), ptext
        ptext = "<i>%s</i>" % ptext

    # sanity-check the paragraph's indentation pattern before classifying
    indentationerror = ""
    if len(indents) == 1 and indents[0][0] == 0:
        if not re.match("<b> ", ptext) and not re.match(
                "(?:\(|<i>)+spoke in", ptext
        ):  # often there is a speaker with a blank space at the front
            indentationerror = "unindented-paragraph"
    if len(indents) > 2:
        indentationerror = "too many different indents"
    if len(indents) == 2 and indents[1][0] != 0:
        if (indents[0][1] == 1 and ptext[0] == '"'
                and indents[0][0] - indents[1][0] > 30):
            # turn this into a blockquote
            indents[0] = (indents[0][0], indents[0][1] + indents[1][1],
                          indents[0][2] + indents[1][2])
            del indents[1]
            if IsNotQuiet():
                pass  #print "ququququq", indents
        else:
            indentationerror = "un-left-justified paragraph"

    # repair a recurring layout glitch where the surname falls outside the <b>
    mfixchinaspek = re.match(
        "<b>(Mr\. \w+)\s*</b>\s*([\w\-]+)\s*\((?:China|Republic of Korea)\)",
        ptext)
    if mfixchinaspek:
        #print "fixing chinaspeak", ptext, "\n"
        ptext = "<b>%s %s</b> %s" % (mfixchinaspek.group(1),
                                     mfixchinaspek.group(2),
                                     ptext[mfixchinaspek.end(2):])
        #print ptext
    # runs of single characters separated by spaces suggest glyphs were
    # dropped during text extraction
    if re.search("\s\S\s\S\s\S\s", ptext):
        print ptext
        raise unexception("probable gaps in text", paranum)

    # try the speaker-introduction patterns one after another
    mspek = re.match(respekp1, ptext)
    if not mspek:
        mspek = re.match(respekp2, ptext)
    if not mspek:
        mspek = re.match(respekp3, ptext)
    if not mspek:
        mspek = re.match(respek, ptext)
    assert not mspek or not re.search("[<>]", mspek.group(1))
    if not mspek and re.match("<[ib]>", ptext):
        speakerbeforetookchair = ""
    if mspek or speakerbeforetookchair:
        if indentationerror == "unindented-paragraph" and speakerbeforetookchair:
            indentationerror = False
        # documents known to contain badly indented speaker paragraphs
        if indentationerror == "unindented-paragraph" and paranum.undocname in [
                "A-55-PV.60", "A-55-PV.63", "A-55-PV.64", "A-55-PV.68",
                "A-55-PV.59", "A-55-PV.44", "A-55-PV.46", "A-55-PV.48",
                "A-55-PV.49", "A-55-PV.52", "A-55-PV.56", "A-55-PV.51",
                "A-60-PV.37", "A-60-PV.38", "A-60-PV.42", "A-60-PV.51",
                "A-60-PV.79", "A-60-PV.85", "A-60-PV.91", "A-60-PV.86",
                "A-60-PV.87", "A-60-PV.92", "A-60-PV.93", "A-60-PV.94"
        ]:
            indentationerror = False
        if indentationerror:
            print ptext
            print indents
            raise unexception(indentationerror + " of speaker-intro", paranum)

    # diagnostic-only secondary pattern
    if respekSS and not mspek:
        m = re.match(respekSS, ptext)
        if IsNotQuiet():
            print ptext
            print " ___ ", m and m.group(0)

    if mspek:
        # spoken paragraph with an explicit speaker introduction
        assert not indentationerror
        assert not re.match("<i>", ptext)
        speakr = re.sub("\s+", " ", mspek.group(1).strip())
        nation = ""
        bIsNotnation = True
        lnation = mspek.group(2)
        # the nation may be embedded in the name part, e.g. "Mr. X (Foo)";
        # bump it out into lnation when FixNationName recognizes it
        mbumpnation = re.search("([^(]*?)\s*\(([^)]*)\)$", speakr)
        if mbumpnation and not lnation and FixNationName(
                mbumpnation.group(2), paranum.sdate):
            speakr = mbumpnation.group(1)
            lnation = mbumpnation.group(2)
            if IsNotQuiet():
                print "BBBB bumpingnat", speakr, lnation
        if lnation:
            nation = IsPrenation(lnation, paranum.sdate)
            if not nation:
                nation = FixNationName(lnation, paranum.sdate)
                bIsNotnation = not nation
            if not nation:
                nation = IsNonnation(lnation, paranum.sdate)
            if not nation:
                print ptext
                print "\ncheck if misspelt or new nonnation, can add * to front of it: ", lnation
                raise unexception("unrecognized nationC or nonnation", paranum)
        elif not re.match(
                "The(?: Acting| Temporary)? President|The(?: Deputy| Assistant)? Secretary-General|The(?: Acting)? Chairman|Transcript",
                speakr):
            if IsNotQuiet():  # allow for less strict when done by cronjob
                raise unexception("missing nation for %s" % speakr, paranum)
        # the speaker name must begin with a known title/honorific
        if not re.match(
                "Mr\.|Mrs\.|Miss |Ms\.|Pope |The |King |Sultan |Prince |Secretary|Arch|Dr\.|Sir |Sheikh?a? |President |Monsignor |Chairman |Crown |His |Dame |Senator |Cardinal |Chief |Captain |Acting |Begum |Major-General |Shaikh |Judge |Count |Emir |Baroness |General |Nana |Princess |U |Rev\. |Kofi |Sayyid |Sheika |Bishop |Sir. |Wilmot |Eliza |Jos|Lord |Justice |Father |Commodore |Metropolitan |Transcript|Madam ",
                speakr):
            print speakr
            raise unexception("improper title on speaker", paranum)
        if re.search("[\.,:;]$", speakr):
            print speakr
            raise unexception("improper tail on speaker", paranum)
        if re.search("[,:;\(\)]", speakr):
            print speakr
            raise unexception("improper contents in speaker", paranum)
        typ = "spoken"
        currentspeaker = (speakr, nation, (mspek.group(5) or ""), bIsNotnation
                          )  # name, nation, language
        #print currentspeaker
        # strip the speaker introduction; keep only the speech body
        ptext = ptext[mspek.end(0):]
        if re.search("</b>", ptext):
            print ptext
            raise unexception("bold in spoken text", paranum)

    elif speakerbeforetookchair:
        # continuation of the speech interrupted by a took-the-chair line
        assert not indentationerror
        typ = "spoken"
        currentspeaker = speakerbeforetookchair
        #print "Continuation speaker", speakerbeforetookchair

    # non-spoken text
    else:
        #<b>Mr. Al-Mahmoud </b>(Qatar) (<i>spoke in Arabic</i>):
        if re.match("<b>.*?(?:</b>.*?:|:</b>)(?!</b>$)", ptext):
            print ptext
            raise unexception("improperly detected spoken text", paranum)
        if re.match("\(?<i>", ptext):
            # procedural italic line; validate against the stock phrases below
            mballots = re.search("Number of ballot papers", ptext)
            if mballots:
                #print "BALLOT:", ptext, "\n"
                indentationerror = False
            if indentationerror:
                print ptext
                print indents
                raise unexception(indentationerror + " of unspoken text",
                                  paranum)
            if not mballots:
                mptext = re.match(
                    "<i>(.*?)</i>\.?\s*(?:\((?:resolutions?|decision|draft resolution) (A?[\d/]*\s*(?:\(?[A-Z,\s]*(?:and|to) [A-Z]\)?|[A-Z]{1,2})?)\))?\.?$",
                    ptext)
                if not mptext and not re.match("\(<i>spoke in", ptext):
                    print "--%s--" % ptext
                    raise unexception("improper italicline", paranum)
            ptext = re.sub("</?[ib]>", "", ptext).strip()

            # further parsing of these phrases may take place in due course
            msodecided = re.match(
                "(?:There being no objection, )?[Ii]t (?:was|is) so decided(?: \(decision [\d/]*\s*(?:A|B|C|A and B)?\))?\.?$",
                ptext)
            mwasadopted = re.match(
                ".*?(?:resolution|decision|agenda|amendment|recommendation).*?(?:was|were) adopted(?i)",
                ptext)
            mcalledorder = re.match(
                "The meeting (?:was called to order|rose|was suspended|was adjourned|resumed|was resumed) (?:at|on)",
                ptext)
            mtookchair = re.match(
                "\s*(?:In the absence of the President, )?(.*?)(?:, \(?Vice[\-\s]President\)?,)? (?:took|in) the [Cc]hair\.?$",
                ptext)
            mretchair = re.match(
                "(?:The President|.*?, Vice-President,|Mrs. Albright.*?|Baroness Amos) (?:returned to|in) the Chair.$",
                ptext)
            mescort = re.search(
                "(?:was escorted|escorted the.*?) (?:(?:from|to) the (?:rostrum|podium|platform)|(?:from|into|to its place in) the (?:General Assembly Hall|Conference Room|Security Council Chamber))(?: by the President and the Secretary-General)?\.?$",
                ptext)
            msecball = re.search(
                "A vote was taken by secret ballot\.(?: The meeting was suspended at|$)",
                ptext)
            mminsil = re.search(
                "The (?:members of the (?:General )?Assembly|Council) observed (?:a|one) minute of (?:silent prayer (?:or|and) meditation|silence)\.$",
                ptext)
            mtellers = re.search(
                "At the invitations? of the (?:Acting )?Presidents?.*?acted as tellers\.$|Having been drawn by lot",
                ptext)
            melected = re.search(
                "[Hh]aving obtained (?:the required (?:two-thirds )?|an absolute )majority.*?(?:(?:were|was|been|is) s?elected|will be included [io]n the list)",
                ptext)
            mmisc = re.search(
                "The Acting President drew the following.*?from the box|sang.*?for the General Assembly|The Assembly heard a musical performance|The Secretary-General presented the award to|From the .*? Group:|Having been drawn by lot by the (?:President|Secretary-General),|were elected members of the Organizational Committee|President \w+ and then Vice-President|Vice-President \S+ \S+ presided over|The following .*? States have.*?been elected members of the Security Council",
                ptext)
            mmiscnote = re.search("\[In the 79th plenary .*? III.\]$", ptext)
            mmstar = re.match("\*", ptext)  # insert * in the text
            mmspokein = re.match(
                "\(spoke in \w+(?:; interpretation.*?|; .*? the delegation)?\)$",
                ptext)
            matinvite = re.match(
                "(?:At the invitation of the President, )?.*? (?:(?:took (?:a |the )?|were escorted to their )seats? at the Council table|(?:took|was invited to take) (?:(?:the |a |their )?(?:seat|place)s? reserved for \w+|a seat|a place|places|seats|their seats|his seat) at the (?:side of the )?Council (?:[Cc]hamber|table))(?:;.*?Chamber)?\.$",
                ptext)
            mscsilence = re.match(
                "The members of the (?:Security )?Council observed a minute of silence.$",
                ptext)
            mscescort = re.search(
                "(?:were|was) escorted to (?:seats|a seat|his place|a place) at the (?:Security )?Council table.$",
                ptext)
            mvtape = re.match(
                "A video ?(?:tape)? was (?:shown|played|displayed) in the Council Chamber.$|An audio tape, in Arabic,|The members of the General Assembly heard a musical performance.$",
                ptext)
            mvprojscreen = re.match(
                "(?:An image was|Two images were|A video was) projected on screen\.$",
                ptext)
            mvresuadjourned = re.match(
                "The meeting was resumed and adjourned on.*? a\.m\.$", ptext)
            if mmstar:
                ptext = ptext[1:]

            # first line is from general assembly. Second line adds in some from security council
            if not (msodecided or mwasadopted or mcalledorder or mtookchair or mretchair or mballots or mescort or msecball or mminsil or mtellers or mmisc or melected or mmstar or mmiscnote or mmspokein or \
                    mvprojscreen or matinvite or mscsilence or mscescort or mvtape or mvresuadjourned):
                print "unrecognized--%s--" % ptext
                print re.match("At the invitations? of the (?:Acting )?", ptext)
                raise unexception("unrecognized italicline", paranum)

            # we can add subtypes to these italic-lines
            typ = "italicline"
            if mtookchair or mretchair:
                typ = "italicline-tookchair"
            if mmspokein:
                typ = "italicline-spokein"
            currentspeaker = None
        elif re.match("<b>", ptext):
            # bold heading line (agenda title etc.); tags are stripped here
            if not re.match(reboldline, ptext):
                print ptext
                raise unexception("unrecognized bold completion", paranum)
            ptext = re.sub("</?b>", "", ptext).strip()
            typ = "boldline"
            currentspeaker = None
        else:
            typ = "unknown"
            print ptext, indents
            raise unexception("possible indent failure", paranum)

    return ptext, typ, currentspeaker
def AppendCluster(res, tlc, sclusttype):
    """Append text-line cluster tlc to the list res, merging or splitting.

    sclusttype says what separated tlc from the previous cluster
    ("gapcluster", "newpage" or "newcolumn").  Across a page/column break
    the cluster may actually continue the previous paragraph, in which
    case its lines are merged into res[-1] instead of appending.  A
    cluster whose indent pattern shows two merged paragraphs is split in
    two.  Raises unexception on an unrecognized indent pattern.
    """
    # check if we should merge to the next paragraph
    assert sclusttype in ["gapcluster", "newpage", "newcolumn"]
    if res and sclusttype != "gapcluster" and len(tlc.indents) == 1:
        indentp = res[-1].indents[-1][0]
        indentn = tlc.indents[0][0]
        # NOTE(review): 0/1, 31/32, 33/36 appear to be the indent columns
        # this layout produces — confirm against the extraction stage
        bbothindented = ((indentp in [31, 32]) and (indentn in [31, 32])) or \
                        ((indentp in [0, 1]) and (indentn in [0, 1])) or \
                        ((indentp in [36, 33]) and (indentp == indentn))
        # previous cluster is a single indented line and this one is flush:
        # possibly a one-line paragraph continuation
        bonelineparacont = (len(res[-1].indents) == 1) and (
            res[-1].indents[0][1] == 1) and (indentp in [31, 32]) and (
                indentn in [0, 1])
        # compare the leading markup tag (<i>/<b>) of both clusters
        td0 = res[-1].txls[-1].ltext[:3]
        td1 = tlc.txls[0].ltext[:3]
        if not re.match("<[ib]>", td0):
            td0 = ""
        if not re.match("<[ib]>", td1):
            td1 = ""
        bstylematches = (td0 == td1)
        #assert not (bbothindented and not bstylematches)
        if re.match("<i>In favour", tlc.txls[0].ltext):
            bstylematches = False
        if re.match("<b>Agenda", res[-1].txls[-1].ltext):
            bstylematches = False

        # likely continuation of paragraph
        if bbothindented and bstylematches:
            res[-1].txls.extend(tlc.txls)
            #print tlc.txls[0].ltext
            return
        else:
            if bonelineparacont:
                if IsNotQuiet():
                    pass
                    #print "checkthiscontinuation case"
                    #print indentp, indentn, bstylematches, bonelineparacont, res[-1].indents
                    #print " ----", tlc.txls[0].ltext
                if bstylematches:
                    if IsNotQuiet():
                        pass  #print "merging"
                    res[-1].txls.extend(tlc.txls)
                    return

    # new cluster; check the indenting pattern is good
    if len(tlc.indents) == 2:
        if tlc.indents[0] <= tlc.indents[1]:
            #print tlc.indents, tlc.txls[0].ltext
            #assert re.match("<[ib]>.*?</[ib]>", tlc.txls[0].ltext) # <i>In favour:</i>
            pass
    # two paragraphs may have been merged, try to separate them out
    elif len(tlc.indents) == 4 and tlc.indents[0][0] == tlc.indents[2][
            0] and tlc.indents[1][0] == tlc.indents[3][0]:
        if IsNotQuiet():
            pass  #print tlc.indents
        assert tlc.indents[0][0] == tlc.indents[2][0]
        assert tlc.indents[1][0] == tlc.indents[3][0]
        # si = number of text lines belonging to the first paragraph
        si = tlc.indents[0][2] + tlc.indents[1][2]
        tlcf = TextLineCluster(None)
        tlcf.txls = tlc.txls[:si]
        del tlc.txls[:si]
        tlcf.indents = tlc.indents[:2]
        del tlc.indents[:2]
        res.append(tlcf)
        if IsNotQuiet():
            pass
            #print "# paragraphs", si
            #print " ", tlc.txls[0].ltext
            #print tlcf.indents, tlc.indents
    elif len(tlc.indents) != 1:
        # anything else is an indent pattern we cannot handle: dump it
        if IsNotQuiet():
            print tlc.indents, "jjjj"
        prevtop = -1
        for txl in tlc.txls:
            if IsNotQuiet():
                if prevtop == txl.top:
                    print " ",
                print txl.indent, txl.ltext
            prevtop = txl.top
        raise unexception(
            "unrecognized indent pattern",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
        assert False  # unreachable; the raise above always fires
    res.append(tlc)
    return
def __init__(self, sdate, docid, subheadingid, agendanumstr, titletext):
    """Build an agenda-subheading record for a General Assembly document.

    docid must look like "A-<session>-PV.<meeting>" (enforced by the
    regex below).  agendanumstr is a comma-separated list of
    "<item>-<session>" agenda numbers ("condolence-…" is collapsed to
    just "condolence").  titletext is HTML; its <p>/<blockquote>
    paragraphs are extracted into self.titlelines and then aggressively
    trimmed of agenda-item prefixes, trailing document references,
    committee-report suffixes and generic lines.
    """
    self.sdate = sdate
    self.docid = docid
    mdocid = re.match("A-(\d\d)-PV\.(\d+)$", docid)
    assert mdocid, docid
    self.nsession = int(mdocid.group(1))
    self.nmeeting = int(mdocid.group(2))
    # so we can sort by it
    self.sortval = (self.nsession, self.nmeeting)
    self.subheadingid = subheadingid
    self.agendanumstr = re.sub("^condolence-.*$", "condolence", agendanumstr)
    #if self.agendanumstr[0] == "c":
    #    print self.agendanumstr, "kkkk"
    self.agendanums = []
    for agendanum in agendanumstr.split(","):
        # every entry is "<item>-<session>" and must belong to this session
        sa = agendanum.split("-")
        assert len(sa) == 2
        assert int(sa[1]) == self.nsession
        if sa[0] == "condolence":
            self.agendanums.append(sa[0])
        else:
            self.agendanums.append(agendanum)

    # break the agenda text up by paragraph
    self.titletext = titletext
    self.titlelines = re.findall(
        "<(?:p|blockquote)[^>]*>(.*?)\.?</(?:p|blockquote)>", titletext)

    # loop forwards to remove agenda items as highest priority
    i = 0
    while i < len(self.titlelines):
        # remove the agenda items title parts
        magmatch = rfrontcomm.match(self.titlelines[i])
        if magmatch:
            if magmatch.end(0) == len(self.titlelines[i]):
                if len(self.titlelines) > 1:
                    # whole line was an agenda-item prefix; drop it
                    # (continue skips the i += 1, so index i now holds the
                    # next line)
                    del self.titlelines[i]
                    continue
            else:
                self.titlelines[i] = self.titlelines[i][
                    magmatch.end(0):].capitalize()
        i += 1

    # loop backwards and trim as much as possible from each row of text
    for i in range(len(self.titlelines) - 1, -1, -1):
        # remove trailing references to documents in parentheses
        mtraildoc = re.search(rtraildoc, self.titlelines[i])
        if mtraildoc:
            if mtraildoc.start(0) == 0:
                if len(self.titlelines) > 1:
                    del self.titlelines[i]
                    continue
            else:
                self.titlelines[i] = self.titlelines[i][:mtraildoc.start(0)]

        # remove trailing references to reports ": report of the Fifth Committee (Part III)"
        while True:  # recurse
            mtrailcommrep = rtrailcomm.search(self.titlelines[i])
            if not mtrailcommrep:
                break
            if mtrailcommrep.start(0) == 0:
                if len(self.titlelines) > 1:
                    # NOTE(review): this continue re-runs the inner while on
                    # whatever line now sits at index i after the deletion —
                    # it does not advance the outer for loop; confirm intended
                    del self.titlelines[i]
                    continue
            else:
                self.titlelines[i] = self.titlelines[i][:mtrailcommrep.
                                                        start(0)]

        # remove entire lines that are generic
        mgenerline = rgenerline.search(self.titlelines[i])
        if mgenerline and len(self.titlelines) > 1:
            del self.titlelines[i]
            continue

        # substitutions
        for substm, substr in substs:
            self.titlelines[i] = re.sub(substm, substr, self.titlelines[i])

    if re.search("agenda item(?i)", self.titlelines[0]) and IsNotQuiet():
        print "Poss bad agenda item", self.titlelines
    assert self.titlelines
    assert not re.match("\s*$", self.titlelines[0]), self.titletext