Beispiel #1
0
    def ExtractDateTime(self, txline, ltext):
        # extract the date out if poss
        mdate = re.match(
            "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$",
            ltext)
        if not mdate:  #Tuesday, 3 December 2002, 10 a.m.
            if re.search("Friday", ltext) and IsNotQuiet():
                print ltext, re.match(
                    "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?m\.?| noon\.?)?(?: \(closed\))?",
                    ltext)
            return

        #print txlines[ih].ltext
        iday = int(mdate.group(1))
        if mdate.group(2) not in months:
            if IsNotQuiet():
                print mdate.group(2), months
            raise unexception(
                "unrecognized month",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        imonth = months.index(mdate.group(2))
        syear = mdate.group(3)
        if not re.match("(?:20\d\d|19\d\d)$", syear):
            raise unexception(
                "bad year",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        ihour = int(mdate.group(4))
        imin = mdate.group(5) and int(mdate.group(5)) or 0
        if mdate.group(6) and mdate.group(6) == "a" and ihour == 12:
            ihour = 0
        elif mdate.group(6) and mdate.group(6) == "p" and ihour != 12:
            ihour += 12
        if self.date:
            raise unexception(
                "date redefined",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        if not (0 <= ihour <= 23) or not (0 <= imin <= 59):
            if IsNotQuiet():
                print ltext
            raise unexception(
                "bad time",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        self.date = "%s-%02d-%02d %02d:%02d" % (syear, imonth + 1, iday, ihour,
                                                imin)
Beispiel #2
0
    def ExtractDotLineChair(self, txlines, ih):
        assert self.pageno == 1
        #<text top="334" left="185" width="584" height="17" font="2">Mr.  Kavan  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . (Czech Republic)</text>
        while True:
            #print "------" + txlines[ih].ltext
            mchair = re.search("([^>:]*?)\s*\. \. \. \. \.", txlines[ih].ltext)
            if mchair:
                break

            # fix missing year date
            #if self.undocname == "A-55-PV.44" and txlines[ih].ltext == "Monday, 30 October, 10 a.m.":
            #    txlines[ih].ltext = "Monday, 30 October 2000, 10 a.m."
            self.ExtractDateTime(txlines[ih], txlines[ih].ltext)

            ih += 1
            if ih == len(txlines):
                return -1

        if not self.date:
            if IsNotQuiet():
                for i in range(ih):
                    print "--%s--" % txlines[i].ltext
            raise unexception(
                "dotlinechair date problem",
                paranumC(txlines[ih].undocname, None, 0, -1,
                         txlines[ih].textcountnumber))
            assert False

        # when country name for the president . . . . is not on same line
        mcountry = re.search("\((.*?)\)$", txlines[ih].ltext)
        if not mcountry:
            ih += 1
            #print txlines[ih].ltext
            mcountry = re.match("\((.*?)\)$", txlines[ih].ltext)
            if not mcountry:
                if IsNotQuiet():
                    print txlines[ih].ltext
                raise unexception(
                    "unable to extract country from  ...-line",
                    paranumC(txlines[ih].undocname, None, 0, -1,
                             txlines[ih].textcountnumber))
        ih += 1
        chairname = re.sub("\s\s+", " ", mchair.group(1)).strip()
        self.chairs.append(
            (chairname, FixNationName(mcountry.group(1), self.date)))
        return ih
Beispiel #3
0
    def __init__(self, txline, lundocname, lpageno, textcountnumber):
        mxline = re.match(
            '<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)" font="(\d+)">(.*?)</text>',
            txline)
        if not mxline:
            print txline, "tttttt"
        self.top = int(mxline.group(1))
        self.left = int(mxline.group(2))
        self.width = int(mxline.group(3))
        self.height = int(mxline.group(4))
        self.font = int(mxline.group(5))
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        self.ltext = mxline.group(6).strip()

        self.ltext = re.sub("<i>\s*</i>|<b>\s*</b>", " ", self.ltext)
        if re.match("<[ib]>\s*</[ib]>|\s*$", self.ltext):
            self.ltext = ""

        # will be removed
        if not self.ltext:
            return

        self.bfootertype = (self.left < 459
                            and self.left + self.width > 459) or re.match(
                                footertext, self.ltext)
        #if self.bfootertype:
        #    print self.ltext

        # move on any short bits that are like 13^(th)
        if self.height == 11 and not self.bfootertype and self.width <= 10:
            #print self.left, self.width, "'%s'" % self.ltext
            assert self.width <= 10
            if self.ltext not in ["th", "rd", "st", "nd"]:
                if IsNotQuiet():
                    print self.ltext
                raise unexception(
                    "unrecognized shortbit",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            self.top += 2  # push the step down from 16 to 18
Beispiel #4
0
def AppendToCluster(txlcol, txl):

    # frig the indentation on the most common mistakes
    if re.match(
            "<i>The meeting (?:was called|was suspended|rose at|was resumed)",
            txl.ltext) and (txl.indent == 0):
        txl.indent = 31

    if not txlcol:
        txlcol.append(TextLineCluster(txl))
        return
    txl.vgap = txl.top - txlcol[-1].txls[-1].top

    #print txlcol[-1].txls[-1].ltext
    #print txl.vgap, txl.width, txl.height, txl.top,  txl.ltext  # zzzz

    # frig vgaps in some cases where the spacing was wider than normal
    if txl.undocname in ["A-50-PV.84", "A-50-PV.88"]:
        if txl.vgap == 21 or txl.vgap == 22:
            txl.vgap = 18
        if txl.vgap == 42:
            txl.vgap = 43
    if txl.undocname == "S-PV-5584":
        if txl.vgap == 20:
            txl.vgap = 19

    if not txl.vgap in familiarvgaps:
        if IsNotQuiet():
            print "\n\n   vgap=", txl.vgap, "\n\nwidth/height/top", txl.width, txl.height, txl.top, txl.ltext  # zzzz
            print " familiar vgaps:", familiarvgaps
        raise unexception(
            "vgap not familiar",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
    if txl.vgap in (0, 17, 18, 19) or txl.vgap == 0:
        txlcol[-1].AddLine(txl)
    else:
        #print txl.vgap, "vvvv", txl.ltext
        txlcol.append(TextLineCluster(txl))
Beispiel #5
0
    def __init__(self, xfil, undocname):
        self.sdate = None
        self.chairs = None
        self.agenda = None
        self.tlcall = None
        self.seccouncilmembers = None
        self.bSecurityCouncil = re.match("S-PV.\d+", undocname)
        self.bGeneralAssembly = re.match("A-\d+-PV", undocname)

        xpages = StripPageTags(xfil, undocname)
        if not xpages:
            return  # bitmap type encountered
        txpages = []
        self.tlcall = []

        for i in range(len(xpages)):
            txpage = TextPage(xpages[i], undocname, i + 1, (txpages or 0)
                              and txpages[-1].textcountnumber)
            if i == 0 and txpage.bSecurityCouncil == "ClosedSession":
                if IsNotQuiet():
                    print " -- closedsession"
                self.tlcall = None
                return  # closed session encountered
            txpages.append(txpage)

            if txpage.bSecurityCouncil and i == 0:
                continue

            # special cases of agenda overflowing into two pages
            if txpage.bSecurityCouncil and i == 1 and undocname in twopageagendas:
                txpages[0].agenda = "%s %s" % (
                    txpages[0].agenda, txpage.agenda
                )  # ram it all into one paragraph (who cares)
                continue

            bmissingcolumns = undocname in ["A-61-PV.106", "A-52-PV.39"]
            if txpage.txlcol1:
                AppendCluster(self.tlcall, txpage.txlcol1[0], "newpage")
                for tlc in txpage.txlcol1[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                #assert i == len(xpages) - 1  # only last page can have missing columns (sometimes it's the first)
                print "page", i, "of", len(xpages)
                #print txpages[-1].textcountnumber
                raise unexception(
                    "missing column not on last page",
                    paranumC(undocname, None, 0, -1,
                             txpages[-1].textcountnumber))

            # have had a case where the first column was the blank one
            if txpage.txlcol2:
                AppendCluster(self.tlcall, txpage.txlcol2[0], "newcolumn")
                for tlc in txpage.txlcol2[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                assert i == len(xpages) - 1, "%d != %d" % (i, len(xpages) - 1)

        # assign ids to the clusters
        self.sdate = txpages[0].date
        paranumlast = paranumC(undocname, self.sdate, 0, -1, 0)
        for tlc in self.tlcall:
            if tlc.txls[0].pageno == paranumlast.pageno:
                paranumlast = paranumC(undocname, self.sdate,
                                       paranumlast.pageno,
                                       paranumlast.paragraphno + 1,
                                       tlc.txls[0].textcountnumber)
            else:
                paranumlast = paranumC(undocname, self.sdate,
                                       tlc.txls[0].pageno, 1,
                                       tlc.txls[0].textcountnumber)
            tlc.paranum = paranumlast

        # merge the lines together and remove double bold/italics that happen across lines
        for tlc in self.tlcall:
            jparatext = []  # don't insert spaces where there is a hyphen
            for txl in tlc.txls:
                if jparatext and not (re.search("\w[-/]$", jparatext[-1])
                                      and re.match("\w", txl.ltext)):
                    jparatext.append(" ")
                jparatext.append(txl.ltext)
            tlc.paratext = "".join(jparatext)

            tlc.paratext = re.sub("-</i> <i>", "-", tlc.paratext)
            tlc.paratext = re.sub("-</b> <b>", "-", tlc.paratext)
            tlc.paratext = re.sub("</b>\s*\.\s*<b>", ". ", tlc.paratext)
            tlc.paratext = re.sub("Secretary- General", "Secretary-General",
                                  tlc.paratext)
            tlc.paratext = re.sub(
                "\s*(?:</i>\s*<i>|</b>\s*<b>|<b>\s*</b>|<i>\s*</i>|<b>\s*<i>\s*</b>\s*</i>)\s*",
                " ", tlc.paratext)
            tlc.paratext = tlc.paratext.strip()

            tlc.paratext = re.sub(
                "^<b>(The(?: Acting)? Co-Chairperson) \(([^\)]*)\)\s*(?:</b>\s*:|:\s*</b>)",
                "<b>\\1</b> (\\2):", tlc.paratext)
            tlc.lastindent = tlc.indents[-1][0]

        self.agenda = txpages[0].agenda
        self.chairs = txpages[0].chairs
        if self.bSecurityCouncil:
            self.seccouncilmembers = txpages[0].seccouncilmembers
Beispiel #6
0
    def __init__(self, xpage, lundocname, lpageno, textcountnumber):
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        self.bSecurityCouncil = re.match("S-PV.(\d+)", self.undocname)
        self.nSecurityCouncilSession = self.bSecurityCouncil and int(
            self.bSecurityCouncil.group(1)) or 0
        self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
        assert self.bSecurityCouncil or self.bGeneralAssembly

        # for right column, if not left justified, this adds a bit more to the right
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 52:
            rightcolstartindentincrement = 1
        else:
            rightcolstartindentincrement = 0

        # set the column starts from some of the special cases we get
        leftcolstart = 90
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 54:
            rightcolstart = 481
        else:
            rightcolstart = 468

        if lundocname in [
                "A-54-PV.100", "A-54-PV.96", "A-54-PV.98", "A-54-PV.99",
                "S-PV-4143", "S-PV-4143-Resu.1"
        ]:
            rightcolstart = 468
        elif lundocname in ["A-54-PV.97"]:
            rightcolstart = 486
        elif re.match("S-PV-335[0-8]", lundocname):
            rightcolstart = 468
        elif re.match("S-PV-334", lundocname):
            rightcolstart = 468
        elif self.nSecurityCouncilSession >= 4144:
            rightcolstart = 468

        #re.match("S-PV-414[4-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-41[5-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-4[2-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-5", lundocname):
        #    rightcolstart = 468

        elif self.bSecurityCouncil:
            rightcolstart = 481
            rightcolstartindentincrement = 1

        # generate the list of lines, sorted by vertical position
        ftxlines = re.findall("<text.*?</text>", xpage)

        txlines = []
        for txline in ftxlines:
            txl = TextLine(txline, lundocname, lpageno, self.textcountnumber)
            self.textcountnumber += 1
            if txl.ltext:
                if txlines and txlines[-1].bfootertype and txlines[
                        -1].top == txl.top:
                    txl.bfootertype = True
                txlines.append(txl)
        txlines.sort(key=TextLineTopKey)

        # the half divider is at 459

        # try to separate out the header and footers
        if self.pageno == 1 and self.bGeneralAssembly:
            ih = self.ExtractDotLineChairHead(txlines)
            #for Dtxl in txlines[-10:]:
            #    print Dtxl.top, Dtxl.left, Dtxl.ltext

            ie = len(txlines) - 1
            while txlines[ie].bfootertype:
                #print "FOOTER:", txlines[ie].ltext
                ie -= 1
            #print "**NON-FOOTER:", txlines[ie].ltext
            ie += 1

            # the whole first page gets parsed separately
            assert not self.bSecurityCouncil

        elif self.bSecurityCouncil and self.pageno == 1:
            if not self.ExtractSeccounFrontPage(txlines):
                self.bSecurityCouncil = "ClosedSession"
            return

        # special case where the agenda spills to a second page (don't forget the outer application of this if)
        elif self.bSecurityCouncil and lundocname in twopageagendas and self.pageno == 2:
            ih = 0
            self.agenda = []
            while ih < len(txlines):
                if 132 <= txlines[ih].top < 1000:
                    self.agenda.append(txlines[ih].ltext)
                ih += 1
            self.agenda = " ".join(self.agenda)
            self.agenda = re.sub("</?b>", " ", self.agenda)
            self.agenda = re.sub("\s\s+", " ", self.agenda)
            lparanum = paranumC(self.undocname, None, 0, -1,
                                self.textcountnumber)
            self.agenda = MarkupLinks(
                CleanupTags(self.agenda, "council-agenda", lparanum),
                self.undocname, lparanum)
            return

        elif self.bGeneralAssembly:
            if re.match("<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[0].ltext):
                ih = 1
            elif re.match("\d", txlines[0].ltext) and re.match(
                    "<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[1].ltext):
                ih = 2
            else:
                #print txlines[0].ltext
                assert re.match("General Assembly",
                                txlines[0].ltext), txlines[0].ltext
                assert re.match("\d+(?:th|st|nd|rd) (?:plenary )?meeting",
                                txlines[1].ltext)
                assert re.match("\S+ [Ss]ession", txlines[2].ltext)
                assert re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) or (
                    lundocname in ["A-50-PV.38", "A-50-PV.40"])
                ih = 4
            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = re.sub("<..?>", "", txlines[ie].ltext).strip()
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie - 1].ltext):
                ie -= 1
            if not re.match("\d+$", pagenumtext):
                if IsNotQuiet():
                    print "jjjj", pagenumtext, txlines[ie].ltext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if int(pagenumtext) != self.pageno:
                if IsNotQuiet():
                    print pagenumtext, self.pageno
                raise unexception(
                    "pagenum serror of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))

        elif self.bSecurityCouncil:
            #if len(txlines) < 4:
            #    raise unexception("intro too short", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber))

            bl0 = len(txlines) > 4 and re.match("Security Council",
                                                txlines[0].ltext)
            bl1 = len(txlines) > 4 and re.match(
                "\d+(?:th|st|nd|rd)? (?:\(Resumption(?: \d)?\) )?(?:meeting)?",
                txlines[1].ltext)
            bl2 = len(txlines) > 4 and re.match("(\w+-\w+|\w+) [Yy]ear",
                                                txlines[2].ltext)
            bl3 = len(txlines) > 4 and re.match("\d+ \w+ \d\d\d\d",
                                                txlines[3].ltext)

            bl4 = re.match(
                "<b>S/PV.\d+\s*(?:\(Resumption [\d|I]\)|\(Part [I]+\))?\s*</b>",
                txlines[0].ltext)
            bl4r = (self.undocname[5:] >= "4143")

            if bl4 and bl4r:
                ih = 1
            elif bl0 and bl1 and bl2 and bl3:
                ih = 4
            else:
                if IsNotQuiet():
                    print "\nFirst four lines on page:", self.pageno, bl4, bl4r
                    print bl0, txlines[0].ltext
                    print bl1, txlines[1].ltext
                    print bl2, txlines[2].ltext
                    print bl3, txlines[3].ltext
                    print bl4, bl4r
                raise unexception(
                    "bad page header",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[0].textcountnumber))

            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = txlines[ie].ltext
            mpagenumtext = re.match("(?:<b>)?(\d+)\s*(?:</b>)?$", pagenumtext)
            if not mpagenumtext:
                if IsNotQuiet():
                    print "jkjk", pagenumtext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            pgoffset = int(mpagenumtext.group(1)) - self.pageno
            if pgoffset != 0 and self.undocname not in misnumberedpages:
                if IsNotQuiet():
                    print "pagenum-offset not in list", self.undocname, mpagenumtext.group(
                        1), self.pageno
                raise unexception(
                    "page pagenum error of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if re.match("\d\d-\d\d\d\d\d$", txlines[ie - 1].ltext):
                ie -= 1

        else:
            assert False

        # separate out the header and footers
        self.txlheader = txlines[:ih]
        self.txlfooter = txlines[ie:]

        # separate the body into the two columns
        self.txlcol1 = []
        self.txlcol2 = []
        self.minindentleft = 9999
        self.minindentright = 9999
        for txl in txlines[ih:ie]:
            if txl.left < 459:
                #print txl.bfootertype, txl.left, txl.width, txl.top, txl.ltext  # zzzz
                # there's a bit of spilling out where the region is larger than it should be for the words as in A-56-PV.64
                if not (txl.left + txl.width <= 459):
                    if txl.left + txl.width > 501:
                        if IsNotQuiet():
                            print txl.left, txl.width, txl.left + txl.width
                            print txl.ltext
                            print "might have page no. 1 on first page (or add to twopageagendas)"
                        raise unexception(
                            "right-hand extension excessive",
                            paranumC(txl.undocname, None, 0, -1,
                                     txl.textcountnumber))
                    if not (txl.left <= 165):
                        bc = -1
                        while True:
                            assert self.txlcol1[-1].txls[
                                bc].top == txl.top  # in-line but shorter
                            if (self.txlcol1[-1].txls[bc].left <= 165):
                                break
                            bc -= 1

                txl.indent = txl.left - leftcolstart
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.ltext
                    raise unexception(
                        "negative indentation",
                        paranumC(txl.undocname, None, 0, -1,
                                 txl.textcountnumber))
                self.minindentleft = min(txl.indent, self.minindentleft)
                txl.brightcol = False
                AppendToCluster(self.txlcol1, txl)

            else:
                txl.indent = txl.left - rightcolstart
                if txl.indent != 0:
                    txl.indent += rightcolstartindentincrement
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.left, rightcolstart
                        print txl.ltext
                    raise unexception(
                        "negative indent on righthand column",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                self.minindentright = min(txl.indent, self.minindentright)
                txl.brightcol = True
                AppendToCluster(self.txlcol2, txl)
Beispiel #7
0
    def ExtractSeccounFrontPage(self, txlines):
        self.date = None
        self.chairs = []
        self.seccouncilmembers = []
        self.agenda = []

        lasttop = -1
        jtxlines = []
        ih = 0
        while ih < len(txlines):
            if txlines[ih].top == lasttop:
                jtxlines[-1] = "%s %s" % (jtxlines[-1], txlines[ih].ltext)
            else:
                jtxlines.append(txlines[ih].ltext)
                lasttop = txlines[ih].top
            ih += 1

        del txlines  # just deletes the reference to this object
        ih = 0
        while ih < len(jtxlines):
            self.ExtractDateTime(None, jtxlines[ih])
            mpresseat = re.match(
                "<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$",
                jtxlines[ih])
            #print jtxlines[ih], mpresseat
            if mpresseat:
                if not self.date:
                    if IsNotQuiet():
                        for i in range(ih):
                            print jtxlines[i]
                    raise unexception(
                        "missingg date",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                if mpresseat.group(1) in ["President", "Chairman"]:
                    assert len(self.chairs) == 0  # first one
                else:
                    assert len(self.chairs) == 1  # later president
                ih += 1
                if mpresseat.group(3):
                    scountry = mpresseat.group(3)
                else:
                    scountry = ""
                if re.search("\(", scountry) and not re.search("\)", scountry):
                    scountry = "%s %s" % (scountry, jtxlines[ih])
                    ih += 1
                mcountry = re.match("\((.*?)\)$", scountry)
                lfscountry = re.sub("\s+", " ", mcountry.group(1))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountry.group(1)
                    raise unexception(
                        "unrecognized nationA",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mpresseat.group(2)).strip()
                self.chairs.append((chairname, fscountry, "president"))

                if fscountry in self.seccouncilmembers:
                    assert len(self.seccouncilmembers) == 1
                    assert fscountry == "New Zealand"
                    assert self.undocname == "S-PV-3370"
                    assert len(self.chairs) == 2
                    del self.chairs[0]
                    del self.seccouncilmembers[0]

                self.seccouncilmembers.append(fscountry)
                continue

            mcountryseat = re.match(
                "(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$",
                jtxlines[ih])
            if mcountryseat:
                if mcountryseat.group(1):
                    if len(self.chairs) not in [
                            1, 2
                    ]:  # in case of second president
                        if IsNotQuiet():
                            print self.chairs, "chchchch"
                        raise unexception(
                            "chairs not thereB",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                else:
                    if len(self.chairs) == 0:
                        if not self.date:  # prob a closed meeting
                            break
                        if IsNotQuiet():
                            print ih, jtxlines[ih]
                        raise unexception(
                            "seat without chair",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                lfscountry = re.sub("\s+", " ", mcountryseat.group(2))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountryseat.group(2)
                    raise unexception(
                        "unrecognized nationB",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mcountryseat.group(3)).strip()
                self.chairs.append((chairname, fscountry, "member"))
                if fscountry not in self.seccouncilmembers:
                    self.seccouncilmembers.append(fscountry)
                else:
                    if IsNotQuiet():
                        print "Repeat-country on council", fscountry
            else:
                if re.search(" \. \. \. \. \. \. ", jtxlines[ih]):
                    if IsNotQuiet():
                        print "--%s--" % jtxlines[ih]
                    raise unexception(
                        "missing country",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
            if re.match("<b>Agenda\s*</b>$", jtxlines[ih]):
                ih += 1
                break
            if re.search("Agenda", jtxlines[ih]):
                print ih, jtxlines
                raise unexception(
                    "unextracted Agenda (should be <b>?)",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            ih += 1

        # could be a closed meeting
        if not self.date:
            alltext = " ".join(jtxlines)
            if re.search(
                    "OFFICIAL COMMUNIQU..*?Held in private (?:in the Security Council Chamber )?at Headquarters(?i)",
                    alltext):
                return False
            return True

        while ih < len(jtxlines):
            if re.match("\d\d-\d\d", jtxlines[ih]):
                break
            if re.match("\d\d.?\d\d\d\d\d \(E\)", jtxlines[ih]):
                break
            if re.match(
                    "This record contains the text of speeches delivered in English",
                    jtxlines[ih]):
                break
            #print "agagag", jtxlines[ih]
            assert not re.search("text of speeches|verbatim(?i)", jtxlines[ih])
            self.agenda.append(jtxlines[ih].strip())
            ih += 1

        #print "ccccc", self.chairs
        lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber)
        if len(self.chairs) not in (15,
                                    17) or len(self.seccouncilmembers) != 15:
            if self.undocname == "S-PV-3446":
                return False
            if IsNotQuiet():
                print len(self.seccouncilmembers), len(
                    self.chairs
                ), "wrong number of members or chairs\n", self.chairs
                print self.seccouncilmembers
            raise unexception("wrongnumber on council", lparanum)

        self.agenda = " ".join(self.agenda)
        self.agenda = re.sub("</?b>", " ", self.agenda)
        self.agenda = re.sub("\s\s+", " ", self.agenda)
        self.agenda = MarkupLinks(
            CleanupTags(self.agenda, "council-agenda", lparanum),
            self.undocname, lparanum)
        return True
Beispiel #8
0
def AppendCluster(res, tlc, sclusttype):
    # check if we should merge to the next paragraph
    assert sclusttype in ["gapcluster", "newpage", "newcolumn"]

    if res and sclusttype != "gapcluster" and len(tlc.indents) == 1:
        indentp = res[-1].indents[-1][0]
        indentn = tlc.indents[0][0]

        bbothindented = ((indentp in [31, 32]) and (indentn in [31, 32])) or \
                        ((indentp in [0, 1]) and (indentn in [0, 1])) or \
                        ((indentp in [36, 33]) and (indentp == indentn))
        bonelineparacont = (len(res[-1].indents)
                            == 1) and (res[-1].indents[0][1] == 1) and (
                                indentp in [31, 32]) and (indentn in [0, 1])

        td0 = res[-1].txls[-1].ltext[:3]
        td1 = tlc.txls[0].ltext[:3]
        if not re.match("<[ib]>", td0):
            td0 = ""
        if not re.match("<[ib]>", td1):
            td1 = ""
        bstylematches = (td0 == td1)
        #assert not (bbothindented and not bstylematches)
        if re.match("<i>In favour", tlc.txls[0].ltext):
            bstylematches = False
        if re.match("<b>Agenda", res[-1].txls[-1].ltext):
            bstylematches = False

        # likely continuation of paragraph
        if bbothindented and bstylematches:
            res[-1].txls.extend(tlc.txls)
            #print tlc.txls[0].ltext
            return
        else:
            if bonelineparacont:
                if IsNotQuiet():
                    pass
                    #print "checkthiscontinuation case"
                    #print indentp, indentn, bstylematches, bonelineparacont, res[-1].indents
                    #print " ----", tlc.txls[0].ltext
                if bstylematches:
                    if IsNotQuiet():
                        pass  #print "merging"
                    res[-1].txls.extend(tlc.txls)
                    return

    # new cluster; check the indenting pattern is good
    if len(tlc.indents) == 2:
        if tlc.indents[0] <= tlc.indents[1]:
            #print tlc.indents, tlc.txls[0].ltext
            #assert re.match("<[ib]>.*?</[ib]>", tlc.txls[0].ltext) # <i>In favour:</i>
            pass

    # two paragraphs may have been merged, try to separate them out
    elif len(tlc.indents) == 4 and tlc.indents[0][0] == tlc.indents[2][
            0] and tlc.indents[1][0] == tlc.indents[3][0]:
        if IsNotQuiet():
            pass  #print tlc.indents
        assert tlc.indents[0][0] == tlc.indents[2][0]
        assert tlc.indents[1][0] == tlc.indents[3][0]
        si = tlc.indents[0][2] + tlc.indents[1][2]
        tlcf = TextLineCluster(None)
        tlcf.txls = tlc.txls[:si]
        del tlc.txls[:si]
        tlcf.indents = tlc.indents[:2]
        del tlc.indents[:2]
        res.append(tlcf)
        if IsNotQuiet():
            pass
            #print "# paragraphs", si
            #print " ", tlc.txls[0].ltext
            #print tlcf.indents, tlc.indents

    elif len(tlc.indents) != 1:
        if IsNotQuiet():
            print tlc.indents, "jjjj"
        prevtop = -1
        for txl in tlc.txls:
            if IsNotQuiet():
                if prevtop == txl.top:
                    print " ",
                print txl.indent, txl.ltext
            prevtop = txl.top
        raise unexception(
            "unrecognized indent pattern",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
        assert False
    res.append(tlc)
    return