def find_holding_answer_issued(self, s):
     """Extract a "Holding answer issued: <day> <month> <year>" date from s.

     The phrase is matched case-insensitively anywhere in s (including
     across newlines).  Because the leading group is greedy, the last
     occurrence in s is the one that is used.

     When the phrase is found, the date is passed to
     self.set_date_holding_answer_was_issued() and the text of s with the
     phrase removed (everything before it joined to everything after it)
     is returned.  When it is not found, nothing is recorded and None is
     returned.

     The month word is converted with month_name_to_int(); datetime.date
     raises ValueError if the resulting day/month/year is not a real date.
     """
     # Raw string: \d and \w are regex escapes, not string escapes.
     holding_match = re.match(
         r'(?ims)^(.*)Holding answer issued: (\d+) (\w+) (\d+)(.*)$', s)
     if not holding_match:
         return None
     before, day, month_name, year, after = holding_match.groups()
     # Explicit base 10 so a zero-padded day such as "08" parses as 8.
     holding_answer_issued = datetime.date(
         int(year, 10),
         month_name_to_int(month_name),
         int(day, 10))
     self.set_date_holding_answer_was_issued(holding_answer_issued)
     return before + after
    def parse(self,filename):

        m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))',filename)
        if not m:
            raise Exception, "Couldn't parse filename: "+filename
        self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (m.group(2),m.group(3))

        filename_leaf = m.group(1)

        # We need to know what date this is, so deal with that first
        # of all in a brutish fashion, but cache the results:

        self.date = None

        if file_to_date.has_key(filename_leaf):
            if verbose: print "Found file to date mapping in cache."
            self.date = datetime.date(*strptime(file_to_date[filename_leaf],"%Y-%m-%d")[0:3])
        else:
            self.make_soup(filename)            
            page_as_text = tidy_string(non_tag_data_in(self.soup.find('body')))
            m = re.search('(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',page_as_text)
            if m:
                day_of_week = m.group(1)
                day = m.group(2)
                month = month_name_to_int(m.group(3))
                year = m.group(4)
                # Sometimes the date string doesn't have the year:
                if not year:
                    m = re.search('day-wa-(\d\d)',filename)
                    if m.group(1) == '99':
                        year = '1999'
                    else:
                        year = '20' + m.group(1)
                self.date = datetime.date( int(year,10), month, int(day,10) )
                if not options.quiet: "Adding file to date mapping to cache."
                add_file_to_date_mapping(filename_leaf,str(self.date))
            else:
                raise Exception, "No date found in file: "+filename

        temp_output_filename = xml_output_directory + "tmp.xml"
        output_filename = xml_output_directory + "spwa" + str(self.date) + ".xml"

        if os.path.exists(output_filename):
            #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename)
            # raise Exception, error
            #if not options.quiet: print error
            return

        if not options.quiet: print "Parsing %s" % filename

        self.make_soup(filename)

        self.ofp = open(temp_output_filename,"w")

        self.ofp.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE publicwhip [

<!ENTITY pound   "&#163;">
<!ENTITY euro    "&#8364;">

<!ENTITY agrave  "&#224;">
<!ENTITY aacute  "&#225;">
<!ENTITY egrave  "&#232;">
<!ENTITY eacute  "&#233;">
<!ENTITY ecirc   "&#234;">
<!ENTITY iacute  "&#237;">
<!ENTITY ograve  "&#242;">
<!ENTITY oacute  "&#243;">
<!ENTITY uacute  "&#250;">
<!ENTITY Aacute  "&#193;">
<!ENTITY Eacute  "&#201;">
<!ENTITY Iacute  "&#205;">
<!ENTITY Oacute  "&#211;">
<!ENTITY Uacute  "&#218;">
<!ENTITY Uuml    "&#220;">
<!ENTITY auml    "&#228;">
<!ENTITY euml    "&#235;">
<!ENTITY iuml    "&#239;">
<!ENTITY ouml    "&#246;">
<!ENTITY uuml    "&#252;">
<!ENTITY fnof    "&#402;">
<!ENTITY aelig   "&#230;">
<!ENTITY dagger  "&#8224;">
<!ENTITY reg     "&#174;">
<!ENTITY nbsp    "&#160;">
<!ENTITY shy     "&#173;">
<!ENTITY deg     "&#176;">
<!ENTITY middot  "&#183;">
<!ENTITY ordm    "&#186;">
<!ENTITY ndash   "&#8211;">
<!ENTITY mdash   "&#8212;">
<!ENTITY lsquo   "&#8216;">
<!ENTITY rsquo   "&#8217;">
<!ENTITY ldquo   "&#8220;">
<!ENTITY rdquo   "&#8221;">
<!ENTITY hellip  "&#8230;">
<!ENTITY bull    "&#8226;">

<!ENTITY acirc   "&#226;">
<!ENTITY Agrave  "&#192;">
<!ENTITY Aring   "&#197;">
<!ENTITY aring   "&#229;">
<!ENTITY atilde  "&#227;">
<!ENTITY Ccedil  "&#199;">
<!ENTITY ccedil  "&#231;">
<!ENTITY Egrave  "&#200;">
<!ENTITY Icirc   "&#206;">
<!ENTITY icirc   "&#238;">
<!ENTITY Igrave  "&#204;">
<!ENTITY igrave  "&#236;">
<!ENTITY ntilde  "&#241;">
<!ENTITY ocirc   "&#244;">
<!ENTITY oelig   "&#339;">
<!ENTITY Ograve  "&#210;">
<!ENTITY Oslash  "&#216;">
<!ENTITY oslash  "&#248;">
<!ENTITY Scaron  "&#352;">
<!ENTITY scaron  "&#353;">
<!ENTITY sup1    "&#185;">
<!ENTITY sup2    "&#178;">
<!ENTITY sup3    "&#179;">
<!ENTITY ugrave  "&#249;">
<!ENTITY ucirc   "&#251;">
<!ENTITY Ugrave  "&#217;">
<!ENTITY yacute  "&#253;">
<!ENTITY frac12  "&#189;">
<!ENTITY micro   "&#181;">
<!ENTITY sbquo   "&#8218;">
<!ENTITY trade   "&#8482;">
<!ENTITY Dagger  "&#8225;">
<!ENTITY radic   "&#8730;">
]>

<publicwhip>

''')

        self.ofp.write("<source url=\"%s\"/>" % self.original_url )
        
        tag_with_most_paragraphs = None
        most_paragraphs_so_far = -1
        
        for t in self.soup.findAll(True):
            ps = paragraphs_in_tag(t)
            if ps > most_paragraphs_so_far:
                tag_with_most_paragraphs = t
                most_paragraphs_so_far = ps
        
        if verbose: print "Using element name: "+tag_with_most_paragraphs.name+" with "+str(most_paragraphs_so_far)+" paragraphs from "+filename
        
        if verbose: print tag_with_most_paragraphs.prettify()
        
        # When we're parsing we might have multiple questions in a
        # row.  We say that something's a question rather than an
        # answer if (a) it's followed by an ID or (b) it begins with
        # "To ask", otherwise it's an answer.  If we hit a new
        # heading, that suggests that the previous thing was an answer
        # as well.
        
        # The business of "Holding answers" is a bit confusing.  At
        # the bottom of each page there may be a list of question IDs
        # which were given holding answers, but the text of the
        # question is not in the page - you only find it when the
        # question is eventually answered.
        
        for t in tag_with_most_paragraphs:
            if t.__class__ == NavigableString:
                s = str(t)
                s = re.sub('(?ims)\s+',' ',s)
                if re.match('(?ims)^\s*$',s):
                    continue
                else:
                    self.add_to_paragraph(tidy_string(str(t)))
                if verbose: print "string: "+str(s)            
            elif t.__class__ == Tag:
                # Look for any <a name=""> tags in here:
                a = t.find( lambda p: p.name == 'a' and p.has_key('name') )
                if a:
                    self.sp_name = a['name']
                if t.has_key('align') and t['align'].lower() == 'right':
                    # Right aligned tags just have the question ID.
                    if self.find_id_and_possible_holding_date(t):
                        self.complete_question()
                    else:
                        if verbose: print "Couldn't parse top-level right aligned tag: "+str(t)
                elif t.has_key('class') and t['class'] == 'largeHeading':
                    self.add_large_heading(tidy_string(non_tag_data_in(t)))
                elif self.something_centered(t) or self.c1_heading(t):
                    # Centred tags are headings for questions...
                    s = tidy_string(non_tag_data_in(t))
                    if len(s) > 0:
                        self.complete_answer()
                        if verbose: print "center: "+s
                        self.add_heading(s)
                elif t.name == 'table':
                    # This is probably a table that's inserted just to
                    # right align the question ID.  The left cell may
                    # contain something to indicate that it's a
                    # holding answer.
                    if self.find_id_and_possible_holding_date(t):
                        # Then also look for the "Holding answer
                        # issued" details...
                        s = non_tag_data_in(t)
                        self.find_holding_answer_issued(s)
                        self.complete_question()
                    else:
                        # Then maybe it's a table as part of the
                        # answer, so add it as a paragraph.
                        self.add_paragraph(str(t))
                elif t.name == 'p':
                    if re.search("(The following questions were given holding answers|Questions given holding answers)",tidy_string(non_tag_data_in(t))):
                        if verbose: print "Found the trailing holding question list!"
                        # This indicates the end of the day's report
                        # for us (just ignore the following list of
                        # answers - it's not very interesting until we
                        # parse some later day and we can tell what
                        # the question was...)                        
                        break
                    if verbose: print "Didn't find the trailing holding question list in: "+non_tag_data_in(t)
                    non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re.match('^\s*$',x), t.contents )
                    if len(non_empty_contents) == 0:
                        continue
                    initial_strong_text = ''
                    while len(non_empty_contents) > 0 and non_empty_contents[0].__class__ == Tag and (non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'):
                        initial_strong_text += " " + non_tag_data_in(non_empty_contents[0])
                        non_empty_contents = non_empty_contents[1:]
                    if len(initial_strong_text) > 0:
                        speaker_name = tidy_string(initial_strong_text)
                        # In some files this will be the ID (possibly
                        # plus holding indication), not right aligned
                        # as usual :(
                        if self.find_id_and_possible_holding_date(speaker_name):
                            self.complete_question()
                        else:
                            speaker_name = re.sub('(?ims)\s*:\s*$','',speaker_name)
                            speaker_id = self.valid_speaker(speaker_name)
                            if speaker_name and speaker_id:
                                self.complete_answer()
                                self.set_speaker(speaker_name,speaker_id)
                                for e in non_empty_contents:
                                    s = tidy_string(str(e))
                                    self.add_to_paragraph(s)
                            else:
                                self.add_paragraph_removing_enclosure(t)
                    else:
                        self.add_paragraph_removing_enclosure(t)
                elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center':
                    # Just add them in a paragraph anyway, even though
                    # that wouldn't be valid HTML 4 strict in the case
                    # of the last three (IIRC)
                    self.add_paragraph(str(t))
                else:
                    # Well, if it's empty of text we don't care...
                    s = non_tag_data_in(t)
                    if not re.match('(?ims)^\s*$',s):
                        raise Exception, "Unknown tag found of name '"+t.name+"' with text: "+t.prettify()
        self.complete_answer()

        # Now output all the XML, working out IDs for each element.
        # IDs are of the form:
        # 
        #   uk.org.publicwhip/spwa/YYYY-MM-DD.X.T
        # 
        #     .... where:
        #            - YYYY-MM-DD is an ISO 8601 date
        # 
        #            - X is a integer starting at 0 on each day, which
        #              should be incremented for each new heading and
        #              be the same for a group of questions and their
        #              answer.
        #
        #            - T is "mh" or "h" for major and minor headings,
        #             "q0", "q1", "q2", etc. for each group of
        #             questions and "r0", "r1", etc. for the answers

        x = -1
        last_heading = None
        current_sp_id = None

        index = 0

        for i in range(0,len(self.all_stuff)):

            if i > 0:
                previous = self.all_stuff[i-1]
            else:
                previous = None

            if i < (len(self.all_stuff) - 1):
                next = self.all_stuff[i+1]
            else:
                next = None
                
            a = self.all_stuff[i]

            self.ofp.write('\n\n')

            if a.__class__ == Heading:
                last_was_answer = True
                if a.major:
                    subtype = "mh"
                else:
                    subtype = "h"
                if next and next.__class__ == QuestionOrReply and next.sp_id:
                    # Then use the question's sp_id:
                    self.ofp.write(a.to_xml(self.get_id(next.sp_id,subtype)))
                else:
                    x += 1
                    self.ofp.write(a.to_xml(self.get_id(str(x),subtype)))
                last_heading = a
            elif a.__class__ == QuestionOrReply:
                # Occasionally we think questions are actually
                # answers, so check the beginning of the first
                # paragraph:
                if not a.is_question and len(a.paragraphs) > 0 and re.search('^(?ims)\s*To\s+ask',a.paragraphs[0]):
                    a.is_question = True
                # If we're suddenly in an answer, reset index.
                if (not a.is_question) and previous and not (previous.__class__ == QuestionOrReply and not previous.is_question):
                    index = 0
                # If we're suddenly in a question, reset index and increment x unless the previous is a heading
                elif a.is_question:
                    if previous:
                        if previous.__class__ == QuestionOrReply:
                            if previous.is_question:
                                # If the one before is a question, that's fine.
                                current_sp_id = a.sp_id
                            else:
                                current_sp_id = a.sp_id
                                # If the previous one was an answer
                                # then we need to replay the last
                                # heading:
                                if not last_heading:
                                    raise Exception, "Somehow there's been no heading so far."
                                last_heading.sp_name = a.sp_name
                                if current_sp_id:
                                    self.ofp.write(last_heading.to_xml(self.get_id(current_sp_id,"h")))
                                else:
                                    x += 1
                                    self.ofp.write(last_heading.to_xml(self.get_id(str(x),"h")))
                                self.ofp.write("\n\n")
                                index = 0
                        else:
                            # i.e. this is the normal case, a question after a heading:
                            current_sp_id = a.sp_id
                            index = 0
                    else:
                        raise Exception, "Nothing before the first question (no heading)"
                if a.is_question:
                    subtype = "q" + str(index)
                else:
                    subtype = "r" + str(index)
                if current_sp_id:
                    self.ofp.write(a.to_xml(self.get_id(current_sp_id,subtype)))
                else:
                    self.ofp.write(a.to_xml(self.get_id(str(x),subtype)))
                index += 1

        self.ofp.write("</publicwhip>")
        self.ofp.close()

        retcode = call( [ "mv", temp_output_filename, output_filename ] )
        if retcode != 0:
            raise Exception, "Moving "+temp_output_filename+" to "+output_filename+" failed."

        xmlvalidate.parse(output_filename)
        #retcode = call( [ "xmlstarlet", "val", output_filename ] )
        #if retcode != 0:
        #    raise Exception, "Validating "+output_filename+" for well-formedness failed."

        fil = open('%schangedates.txt' % xml_output_directory, 'a+')
        fil.write('%d,spwa%s.xml\n' % (time.time(), self.date))
        fil.close()
 def find_holding_answer_issued(self,s):
     """Extract a "Holding answer issued: <day> <month> <year>" date from s.

     The phrase is matched case-insensitively anywhere in s (including
     across newlines).  Because the leading group is greedy, the last
     occurrence in s is the one that is used.

     When the phrase is found, the date is passed to
     self.set_date_holding_answer_was_issued() and the text of s with the
     phrase removed (everything before it joined to everything after it)
     is returned.  When it is not found, nothing is recorded and None is
     returned.

     The month word is converted with month_name_to_int(); datetime.date
     raises ValueError if the resulting day/month/year is not a real date.
     """
     # Raw string: \d and \w are regex escapes, not string escapes.
     holding_match = re.match(r'(?ims)^(.*)Holding answer issued: (\d+) (\w+) (\d+)(.*)$',s)
     if not holding_match:
         return None
     before, day, month_name, year, after = holding_match.groups()
     # Explicit base 10 so a zero-padded day such as "08" parses as 8.
     holding_answer_issued = datetime.date(int(year,10),month_name_to_int(month_name),int(day,10))
     self.set_date_holding_answer_was_issued(holding_answer_issued)
     return before + after
                continue
            elif m_day:
                # print "Got day: "+s
                daily_pages.add( (subdir,leaf) )
            elif m_week:
                day_start = m_week.group(1)
                month_start = m_week.group(2)
                year_start = m_week.group(3)
                day_end = m_week.group(4)
                month_end = m_week.group(5)
                year_end = m_week.group(6)
                if not month_start:
                    month_start = month_end
                if not year_start:
                    year_start = year_end
                start_date = datetime.date( int(year_start), month_name_to_int(month_start), int(day_start,10) )
                end_date = datetime.date( int(year_end), month_name_to_int(month_end), int(day_end,10) )
                contents_pages.add( (subdir,leaf,start_date,end_date) )
                contents_hash[subdir+"_"+leaf] = True

    # Fetch all the contents pages:

    for (subdir,leaf,start_date,end_date) in contents_pages:

        contents_filename = output_directory + "contents-"+subdir+"_"+leaf
        contents_url = written_answers_prefix + subdir + "/" + leaf

        # Fetch the contents page if we don't already have it, or if
        # it was the last one fetched:

        if not os.path.exists(contents_filename) or (len(existing_contents_pages) > 0 and existing_contents_pages[-1] == contents_filename):
    def parse(self, filename):

        m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))', filename)
        if not m:
            raise Exception, "Couldn't parse filename: " + filename
        self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (
            m.group(2), m.group(3))

        filename_leaf = m.group(1)

        # We need to know what date this is, so deal with that first
        # of all in a brutish fashion, but cache the results:

        self.date = None

        if file_to_date.has_key(filename_leaf):
            if verbose: print "Found file to date mapping in cache."
            self.date = datetime.date(
                *strptime(file_to_date[filename_leaf], "%Y-%m-%d")[0:3])
        else:
            self.make_soup(filename)
            page_as_text = tidy_string(non_tag_data_in(self.soup.find('body')))
            m = re.search(
                '(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',
                page_as_text)
            if m:
                day_of_week = m.group(1)
                day = m.group(2)
                month = month_name_to_int(m.group(3))
                year = m.group(4)
                # Sometimes the date string doesn't have the year:
                if not year:
                    m = re.search('day-wa-(\d\d)', filename)
                    if m.group(1) == '99':
                        year = '1999'
                    else:
                        year = '20' + m.group(1)
                self.date = datetime.date(int(year, 10), month, int(day, 10))
                if not options.quiet: "Adding file to date mapping to cache."
                add_file_to_date_mapping(filename_leaf, str(self.date))
            else:
                raise Exception, "No date found in file: " + filename

        temp_output_filename = xml_output_directory + "tmp.xml"
        output_filename = xml_output_directory + "spwa" + str(
            self.date) + ".xml"

        if os.path.exists(output_filename):
            #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename)
            # raise Exception, error
            #if not options.quiet: print error
            return

        if not options.quiet: print "Parsing %s" % filename

        self.make_soup(filename)

        self.ofp = open(temp_output_filename, "w")

        self.ofp.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE publicwhip [

<!ENTITY pound   "&#163;">
<!ENTITY euro    "&#8364;">

<!ENTITY agrave  "&#224;">
<!ENTITY aacute  "&#225;">
<!ENTITY egrave  "&#232;">
<!ENTITY eacute  "&#233;">
<!ENTITY ecirc   "&#234;">
<!ENTITY iacute  "&#237;">
<!ENTITY ograve  "&#242;">
<!ENTITY oacute  "&#243;">
<!ENTITY uacute  "&#250;">
<!ENTITY Aacute  "&#193;">
<!ENTITY Eacute  "&#201;">
<!ENTITY Iacute  "&#205;">
<!ENTITY Oacute  "&#211;">
<!ENTITY Uacute  "&#218;">
<!ENTITY Uuml    "&#220;">
<!ENTITY auml    "&#228;">
<!ENTITY euml    "&#235;">
<!ENTITY iuml    "&#239;">
<!ENTITY ouml    "&#246;">
<!ENTITY uuml    "&#252;">
<!ENTITY fnof    "&#402;">
<!ENTITY aelig   "&#230;">
<!ENTITY dagger  "&#8224;">
<!ENTITY reg     "&#174;">
<!ENTITY nbsp    "&#160;">
<!ENTITY shy     "&#173;">
<!ENTITY deg     "&#176;">
<!ENTITY middot  "&#183;">
<!ENTITY ordm    "&#186;">
<!ENTITY ndash   "&#8211;">
<!ENTITY mdash   "&#8212;">
<!ENTITY lsquo   "&#8216;">
<!ENTITY rsquo   "&#8217;">
<!ENTITY ldquo   "&#8220;">
<!ENTITY rdquo   "&#8221;">
<!ENTITY hellip  "&#8230;">
<!ENTITY bull    "&#8226;">

<!ENTITY acirc   "&#226;">
<!ENTITY Agrave  "&#192;">
<!ENTITY Aring   "&#197;">
<!ENTITY aring   "&#229;">
<!ENTITY atilde  "&#227;">
<!ENTITY Ccedil  "&#199;">
<!ENTITY ccedil  "&#231;">
<!ENTITY Egrave  "&#200;">
<!ENTITY Icirc   "&#206;">
<!ENTITY icirc   "&#238;">
<!ENTITY Igrave  "&#204;">
<!ENTITY igrave  "&#236;">
<!ENTITY ntilde  "&#241;">
<!ENTITY ocirc   "&#244;">
<!ENTITY oelig   "&#339;">
<!ENTITY Ograve  "&#210;">
<!ENTITY Oslash  "&#216;">
<!ENTITY oslash  "&#248;">
<!ENTITY Scaron  "&#352;">
<!ENTITY scaron  "&#353;">
<!ENTITY sup1    "&#185;">
<!ENTITY sup2    "&#178;">
<!ENTITY sup3    "&#179;">
<!ENTITY ugrave  "&#249;">
<!ENTITY ucirc   "&#251;">
<!ENTITY Ugrave  "&#217;">
<!ENTITY yacute  "&#253;">
<!ENTITY frac12  "&#189;">
<!ENTITY micro   "&#181;">
<!ENTITY sbquo   "&#8218;">
<!ENTITY trade   "&#8482;">
<!ENTITY Dagger  "&#8225;">
<!ENTITY radic   "&#8730;">
]>

<publicwhip>

''')

        self.ofp.write("<source url=\"%s\"/>" % self.original_url)

        tag_with_most_paragraphs = None
        most_paragraphs_so_far = -1

        for t in self.soup.findAll(True):
            ps = paragraphs_in_tag(t)
            if ps > most_paragraphs_so_far:
                tag_with_most_paragraphs = t
                most_paragraphs_so_far = ps

        if verbose:
            print "Using element name: " + tag_with_most_paragraphs.name + " with " + str(
                most_paragraphs_so_far) + " paragraphs from " + filename

        if verbose: print tag_with_most_paragraphs.prettify()

        # When we're parsing we might have multiple questions in a
        # row.  We say that something's a question rather than an
        # answer if (a) it's followed by an ID or (b) it begins with
        # "To ask", otherwise it's an answer.  If we hit a new
        # heading, that suggests that the previous thing was an answer
        # as well.

        # The business of "Holding answers" is a bit confusing.  At
        # the bottom of each page there may be a list of question IDs
        # which were given holding answers, but the text of the
        # question is not in the page - you only find it when the
        # question is eventually answered.

        for t in tag_with_most_paragraphs:
            if t.__class__ == NavigableString:
                s = str(t)
                s = re.sub('(?ims)\s+', ' ', s)
                if re.match('(?ims)^\s*$', s):
                    continue
                else:
                    self.add_to_paragraph(tidy_string(str(t)))
                if verbose: print "string: " + str(s)
            elif t.__class__ == Tag:
                # Look for any <a name=""> tags in here:
                a = t.find(lambda p: p.name == 'a' and p.has_key('name'))
                if a:
                    self.sp_name = a['name']
                if t.has_key('align') and t['align'].lower() == 'right':
                    # Right aligned tags just have the question ID.
                    if self.find_id_and_possible_holding_date(t):
                        self.complete_question()
                    else:
                        if verbose:
                            print "Couldn't parse top-level right aligned tag: " + str(
                                t)
                elif t.has_key('class') and t['class'] == 'largeHeading':
                    self.add_large_heading(tidy_string(non_tag_data_in(t)))
                elif self.something_centered(t) or self.c1_heading(t):
                    # Centred tags are headings for questions...
                    s = tidy_string(non_tag_data_in(t))
                    if len(s) > 0:
                        self.complete_answer()
                        if verbose: print "center: " + s
                        self.add_heading(s)
                elif t.name == 'table':
                    # This is probably a table that's inserted just to
                    # right align the question ID.  The left cell may
                    # contain something to indicate that it's a
                    # holding answer.
                    if self.find_id_and_possible_holding_date(t):
                        # Then also look for the "Holding answer
                        # issued" details...
                        s = non_tag_data_in(t)
                        self.find_holding_answer_issued(s)
                        self.complete_question()
                    else:
                        # Then maybe it's a table as part of the
                        # answer, so add it as a paragraph.
                        self.add_paragraph(str(t))
                elif t.name == 'p':
                    if re.search(
                            "(The following questions were given holding answers|Questions given holding answers)",
                            tidy_string(non_tag_data_in(t))):
                        if verbose:
                            print "Found the trailing holding question list!"
                        # This indicates the end of the day's report
                        # for us (just ignore the following list of
                        # answers - it's not very interesting until we
                        # parse some later day and we can tell what
                        # the question was...)
                        break
                    if verbose:
                        print "Didn't find the trailing holding question list in: " + non_tag_data_in(
                            t)
                    non_empty_contents = filter(
                        lambda x: x.__class__ != NavigableString or not re.
                        match('^\s*$', x), t.contents)
                    if len(non_empty_contents) == 0:
                        continue
                    initial_strong_text = ''
                    while len(non_empty_contents) > 0 and non_empty_contents[
                            0].__class__ == Tag and (
                                non_empty_contents[0].name == 'strong'
                                or non_empty_contents[0].name == 'b'):
                        initial_strong_text += " " + non_tag_data_in(
                            non_empty_contents[0])
                        non_empty_contents = non_empty_contents[1:]
                    if len(initial_strong_text) > 0:
                        speaker_name = tidy_string(initial_strong_text)
                        # In some files this will be the ID (possibly
                        # plus holding indication), not right aligned
                        # as usual :(
                        if self.find_id_and_possible_holding_date(
                                speaker_name):
                            self.complete_question()
                        else:
                            speaker_name = re.sub('(?ims)\s*:\s*$', '',
                                                  speaker_name)
                            person_id = self.valid_speaker(speaker_name)
                            if speaker_name and person_id:
                                self.complete_answer()
                                self.set_speaker(speaker_name, person_id)
                                for e in non_empty_contents:
                                    s = tidy_string(str(e))
                                    self.add_to_paragraph(s)
                            else:
                                self.add_paragraph_removing_enclosure(t)
                    else:
                        self.add_paragraph_removing_enclosure(t)
                elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center':
                    # Just add them in a paragraph anyway, even though
                    # that wouldn't be valid HTML 4 strict in the case
                    # of the last three (IIRC)
                    self.add_paragraph(str(t))
                else:
                    # Well, if it's empty of text we don't care...
                    s = non_tag_data_in(t)
                    if not re.match('(?ims)^\s*$', s):
                        raise Exception, "Unknown tag found of name '" + t.name + "' with text: " + t.prettify(
                        )
        self.complete_answer()

        # Now output all the XML, working out IDs for each element.
        # IDs are of the form:
        #
        #   uk.org.publicwhip/spwa/YYYY-MM-DD.X.T
        #
        #     .... where:
        #            - YYYY-MM-DD is an ISO 8601 date
        #
        #            - X is a integer starting at 0 on each day, which
        #              should be incremented for each new heading and
        #              be the same for a group of questions and their
        #              answer.
        #
        #            - T is "mh" or "h" for major and minor headings,
        #             "q0", "q1", "q2", etc. for each group of
        #             questions and "r0", "r1", etc. for the answers

        x = -1
        last_heading = None
        current_sp_id = None

        index = 0

        for i in range(0, len(self.all_stuff)):

            if i > 0:
                previous = self.all_stuff[i - 1]
            else:
                previous = None

            if i < (len(self.all_stuff) - 1):
                next = self.all_stuff[i + 1]
            else:
                next = None

            a = self.all_stuff[i]

            self.ofp.write('\n\n')

            if a.__class__ == Heading:
                last_was_answer = True
                if a.major:
                    subtype = "mh"
                else:
                    subtype = "h"
                if next and next.__class__ == QuestionOrReply and next.sp_id:
                    # Then use the question's sp_id:
                    self.ofp.write(a.to_xml(self.get_id(next.sp_id, subtype)))
                else:
                    x += 1
                    self.ofp.write(a.to_xml(self.get_id(str(x), subtype)))
                last_heading = a
            elif a.__class__ == QuestionOrReply:
                # Occasionally we think questions are actually
                # answers, so check the beginning of the first
                # paragraph:
                if not a.is_question and len(a.paragraphs) > 0 and re.search(
                        '^(?ims)\s*To\s+ask', a.paragraphs[0]):
                    a.is_question = True
                # If we're suddenly in an answer, reset index.
                if (not a.is_question) and previous and not (
                        previous.__class__ == QuestionOrReply
                        and not previous.is_question):
                    index = 0
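                # Otherwise, for a question, current_sp_id is taken from this
                # question.  After another question nothing else changes; after
                # an answer the last heading is replayed (x is incremented only
                # if the question has no sp_id) and index is reset; after a
                # heading only index is reset.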
                elif a.is_question:
                    if previous:
                        if previous.__class__ == QuestionOrReply:
                            if previous.is_question:
                                # If the one before is a question, that's fine.
                                current_sp_id = a.sp_id
                            else:
                                current_sp_id = a.sp_id
                                # If the previous one was an answer
                                # then we need to replay the last
                                # heading:
                                if not last_heading:
                                    raise Exception, "Somehow there's been no heading so far."
                                last_heading.sp_name = a.sp_name
                                if current_sp_id:
                                    self.ofp.write(
                                        last_heading.to_xml(
                                            self.get_id(current_sp_id, "h")))
                                else:
                                    x += 1
                                    self.ofp.write(
                                        last_heading.to_xml(
                                            self.get_id(str(x), "h")))
                                self.ofp.write("\n\n")
                                index = 0
                        else:
                            # i.e. this is the normal case, a question after a heading:
                            current_sp_id = a.sp_id
                            index = 0
                    else:
                        raise Exception, "Nothing before the first question (no heading)"
                if a.is_question:
                    subtype = "q" + str(index)
                else:
                    subtype = "r" + str(index)
                if current_sp_id:
                    self.ofp.write(
                        a.to_xml(self.get_id(current_sp_id, subtype)))
                else:
                    self.ofp.write(a.to_xml(self.get_id(str(x), subtype)))
                index += 1

        self.ofp.write("</publicwhip>")
        self.ofp.close()

        retcode = call(["mv", temp_output_filename, output_filename])
        if retcode != 0:
            raise Exception, "Moving " + temp_output_filename + " to " + output_filename + " failed."

        xmlvalidate.parse(output_filename)
        #retcode = call( [ "xmlstarlet", "val", output_filename ] )
        #if retcode != 0:
        #    raise Exception, "Validating "+output_filename+" for well-formedness failed."

        fil = open('%schangedates.txt' % xml_output_directory, 'a+')
        fil.write('%d,spwa%s.xml\n' % (time.time(), self.date))
        fil.close()
    day_body = day_soup.find('body')
    if day_body:
        page_as_text = non_tag_data_in(day_body)
    else:
        error = "File couldn't be parsed by MinimalSoup: "+day_filename
        raise Exception, error

    # Now guess the date from the file contents as well:
    m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text)
    if not m:
        m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text)
    if m:
        day_of_week = m.group(2)
        day = m.group(3)
        month = month_name_to_int(m.group(4))
        if month == 0:
            print "Whole match was '" + str(m.group(0)) + "'"
            raise Exception, "Month name '"+m.group(4)+"' not known in file: "+day_filename
        else:
            year = m.group(6)
            # Sometimes the date string doesn't have the year:
            if not year:
                m = re.search('(?i)day-[ab]b-(\d\d)',day_filename)
                if m.group(1) == '99':
                    year = '1999'
                else:
                    year = '20' + m.group(1)
            date_from_filecontents = datetime.date( int(year,10), month, int(day,10) )

    if date_from_filename == date_from_filecontents:
Exemple #7
0
    else:
        error = "File couldn't be parsed by MinimalSoup: " + day_filename
        raise Exception, error

    # Now guess the date from the file contents as well:
    m = re.search(
        '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',
        page_as_text)
    if not m:
        m = re.search(
            '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',
            page_as_text)
    if m:
        day_of_week = m.group(2)
        day = m.group(3)
        month = month_name_to_int(m.group(4))
        if month == 0:
            print "Whole match was '" + str(m.group(0)) + "'"
            raise Exception, "Month name '" + m.group(
                4) + "' not known in file: " + day_filename
        else:
            year = m.group(6)
            # Sometimes the date string doesn't have the year:
            if not year:
                m = re.search('(?i)day-[ab]b-(\d\d)', day_filename)
                if m.group(1) == '99':
                    year = '1999'
                else:
                    year = '20' + m.group(1)
            date_from_filecontents = datetime.date(int(year, 10), month,
                                                   int(day, 10))
    for t in link_tags:
        # old format link - /or-10/sor1223-01.htm
        # new format link - http://www.scottish.parliament.uk/Apps2/Business/ORSearch/ReportView.aspx?r=6132&amp;mode=html
        if t.has_key('href') and (re.match('^or-',t['href']) or re.search('ORSearch/ReportView.aspx.*?mode=html', t['href'])):
            # print t
            s = ""
            for c in t.contents:
                if type(c) == NavigableString:
                    s = s + str(c)
            s = re.sub(',','',s)
            # print year_index_filename + "==> " + s
            d = None
            m = re.match( '^(Official Report)?\s*(\d+)\s+(\w+)', s )
            if not m:
                raise Exception, "Unrecognized date format in '%s'" % s
            d = datetime.date( year, month_name_to_int(m.group(3)), int(m.group(2)) )

            page = str(t['href'])

            contents_url = official_reports_prefix + page
            contents_last_modified = None

            if fetched_urls_hash.has_key(contents_url):
                continue

            output_filename = official_report_template %  ( str(d), 0 )
            if not os.path.exists(output_filename):
                if options.verbose: print "Fetching %s" % contents_url
                ur = urllib.urlopen(contents_url)
                if ur.info().has_key('last-modified'):
                    contents_last_modified = ur.info()['last-modified']