def find_holding_answer_issued(self, s):
    """Extract and record a 'Holding answer issued' date from s.

    If s contains 'Holding answer issued: <day> <month> <year>', the
    parsed date is recorded via set_date_holding_answer_was_issued and
    the remaining text (everything before plus everything after the
    matched phrase) is returned.  If there is no such phrase, nothing
    is recorded and None is returned.
    """
    m = re.match(
        '(?ims)^(.*)Holding answer issued: (\d+) (\w+) (\d+)(.*)$', s)
    if not m:
        return None
    before, day, month_name, year, after = m.groups()
    # month_name_to_int is a file-level helper; day/year are parsed base 10.
    issued_on = datetime.date(
        int(year, 10), month_name_to_int(month_name), int(day, 10))
    self.set_date_holding_answer_was_issued(issued_on)
    return before + after
def parse(self,filename): m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))',filename) if not m: raise Exception, "Couldn't parse filename: "+filename self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (m.group(2),m.group(3)) filename_leaf = m.group(1) # We need to know what date this is, so deal with that first # of all in a brutish fashion, but cache the results: self.date = None if file_to_date.has_key(filename_leaf): if verbose: print "Found file to date mapping in cache." self.date = datetime.date(*strptime(file_to_date[filename_leaf],"%Y-%m-%d")[0:3]) else: self.make_soup(filename) page_as_text = tidy_string(non_tag_data_in(self.soup.find('body'))) m = re.search('(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',page_as_text) if m: day_of_week = m.group(1) day = m.group(2) month = month_name_to_int(m.group(3)) year = m.group(4) # Sometimes the date string doesn't have the year: if not year: m = re.search('day-wa-(\d\d)',filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) self.date = datetime.date( int(year,10), month, int(day,10) ) if not options.quiet: "Adding file to date mapping to cache." 
add_file_to_date_mapping(filename_leaf,str(self.date)) else: raise Exception, "No date found in file: "+filename temp_output_filename = xml_output_directory + "tmp.xml" output_filename = xml_output_directory + "spwa" + str(self.date) + ".xml" if os.path.exists(output_filename): #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename) # raise Exception, error #if not options.quiet: print error return if not options.quiet: print "Parsing %s" % filename self.make_soup(filename) self.ofp = open(temp_output_filename,"w") self.ofp.write('''<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE publicwhip [ <!ENTITY pound "£"> <!ENTITY euro "€"> <!ENTITY agrave "à"> <!ENTITY aacute "á"> <!ENTITY egrave "è"> <!ENTITY eacute "é"> <!ENTITY ecirc "ê"> <!ENTITY iacute "í"> <!ENTITY ograve "ò"> <!ENTITY oacute "ó"> <!ENTITY uacute "ú"> <!ENTITY Aacute "Á"> <!ENTITY Eacute "É"> <!ENTITY Iacute "Í"> <!ENTITY Oacute "Ó"> <!ENTITY Uacute "Ú"> <!ENTITY Uuml "Ü"> <!ENTITY auml "ä"> <!ENTITY euml "ë"> <!ENTITY iuml "ï"> <!ENTITY ouml "ö"> <!ENTITY uuml "ü"> <!ENTITY fnof "ƒ"> <!ENTITY aelig "æ"> <!ENTITY dagger "†"> <!ENTITY reg "®"> <!ENTITY nbsp " "> <!ENTITY shy "­"> <!ENTITY deg "°"> <!ENTITY middot "·"> <!ENTITY ordm "º"> <!ENTITY ndash "–"> <!ENTITY mdash "—"> <!ENTITY lsquo "‘"> <!ENTITY rsquo "’"> <!ENTITY ldquo "“"> <!ENTITY rdquo "”"> <!ENTITY hellip "…"> <!ENTITY bull "•"> <!ENTITY acirc "â"> <!ENTITY Agrave "À"> <!ENTITY Aring "Å"> <!ENTITY aring "å"> <!ENTITY atilde "ã"> <!ENTITY Ccedil "Ç"> <!ENTITY ccedil "ç"> <!ENTITY Egrave "È"> <!ENTITY Icirc "Î"> <!ENTITY icirc "î"> <!ENTITY Igrave "Ì"> <!ENTITY igrave "ì"> <!ENTITY ntilde "ñ"> <!ENTITY ocirc "ô"> <!ENTITY oelig "œ"> <!ENTITY Ograve "Ò"> <!ENTITY Oslash "Ø"> <!ENTITY oslash "ø"> <!ENTITY Scaron "Š"> <!ENTITY scaron "š"> <!ENTITY sup1 "¹"> <!ENTITY sup2 "²"> <!ENTITY sup3 "³"> <!ENTITY ugrave "ù"> <!ENTITY ucirc "û"> <!ENTITY Ugrave "Ù"> <!ENTITY yacute "ý"> <!ENTITY frac12 "½"> 
<!ENTITY micro "µ"> <!ENTITY sbquo "‚"> <!ENTITY trade "™"> <!ENTITY Dagger "‡"> <!ENTITY radic "√"> ]> <publicwhip> ''') self.ofp.write("<source url=\"%s\"/>" % self.original_url ) tag_with_most_paragraphs = None most_paragraphs_so_far = -1 for t in self.soup.findAll(True): ps = paragraphs_in_tag(t) if ps > most_paragraphs_so_far: tag_with_most_paragraphs = t most_paragraphs_so_far = ps if verbose: print "Using element name: "+tag_with_most_paragraphs.name+" with "+str(most_paragraphs_so_far)+" paragraphs from "+filename if verbose: print tag_with_most_paragraphs.prettify() # When we're parsing we might have multiple questions in a # row. We say that something's a question rather than an # answer if (a) it's followed by an ID or (b) it begins with # "To ask", otherwise it's an answer. If we hit a new # heading, that suggests that the previous thing was an answer # as well. # The business of "Holding answers" is a bit confusing. At # the bottom of each page there may be a list of question IDs # which were given holding answers, but the text of the # question is not in the page - you only find it when the # question is eventually answered. for t in tag_with_most_paragraphs: if t.__class__ == NavigableString: s = str(t) s = re.sub('(?ims)\s+',' ',s) if re.match('(?ims)^\s*$',s): continue else: self.add_to_paragraph(tidy_string(str(t))) if verbose: print "string: "+str(s) elif t.__class__ == Tag: # Look for any <a name=""> tags in here: a = t.find( lambda p: p.name == 'a' and p.has_key('name') ) if a: self.sp_name = a['name'] if t.has_key('align') and t['align'].lower() == 'right': # Right aligned tags just have the question ID. 
if self.find_id_and_possible_holding_date(t): self.complete_question() else: if verbose: print "Couldn't parse top-level right aligned tag: "+str(t) elif t.has_key('class') and t['class'] == 'largeHeading': self.add_large_heading(tidy_string(non_tag_data_in(t))) elif self.something_centered(t) or self.c1_heading(t): # Centred tags are headings for questions... s = tidy_string(non_tag_data_in(t)) if len(s) > 0: self.complete_answer() if verbose: print "center: "+s self.add_heading(s) elif t.name == 'table': # This is probably a table that's inserted just to # right align the question ID. The left cell may # contain something to indicate that it's a # holding answer. if self.find_id_and_possible_holding_date(t): # Then also look for the "Holding answer # issued" details... s = non_tag_data_in(t) self.find_holding_answer_issued(s) self.complete_question() else: # Then maybe it's a table as part of the # answer, so add it as a paragraph. self.add_paragraph(str(t)) elif t.name == 'p': if re.search("(The following questions were given holding answers|Questions given holding answers)",tidy_string(non_tag_data_in(t))): if verbose: print "Found the trailing holding question list!" # This indicates the end of the day's report # for us (just ignore the following list of # answers - it's not very interesting until we # parse some later day and we can tell what # the question was...) 
break if verbose: print "Didn't find the trailing holding question list in: "+non_tag_data_in(t) non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re.match('^\s*$',x), t.contents ) if len(non_empty_contents) == 0: continue initial_strong_text = '' while len(non_empty_contents) > 0 and non_empty_contents[0].__class__ == Tag and (non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'): initial_strong_text += " " + non_tag_data_in(non_empty_contents[0]) non_empty_contents = non_empty_contents[1:] if len(initial_strong_text) > 0: speaker_name = tidy_string(initial_strong_text) # In some files this will be the ID (possibly # plus holding indication), not right aligned # as usual :( if self.find_id_and_possible_holding_date(speaker_name): self.complete_question() else: speaker_name = re.sub('(?ims)\s*:\s*$','',speaker_name) speaker_id = self.valid_speaker(speaker_name) if speaker_name and speaker_id: self.complete_answer() self.set_speaker(speaker_name,speaker_id) for e in non_empty_contents: s = tidy_string(str(e)) self.add_to_paragraph(s) else: self.add_paragraph_removing_enclosure(t) else: self.add_paragraph_removing_enclosure(t) elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center': # Just add them in a paragraph anyway, even though # that wouldn't be valid HTML 4 strict in the case # of the last three (IIRC) self.add_paragraph(str(t)) else: # Well, if it's empty of text we don't care... s = non_tag_data_in(t) if not re.match('(?ims)^\s*$',s): raise Exception, "Unknown tag found of name '"+t.name+"' with text: "+t.prettify() self.complete_answer() # Now output all the XML, working out IDs for each element. # IDs are of the form: # # uk.org.publicwhip/spwa/YYYY-MM-DD.X.T # # .... 
where: # - YYYY-MM-DD is an ISO 8601 date # # - X is a integer starting at 0 on each day, which # should be incremented for each new heading and # be the same for a group of questions and their # answer. # # - T is "mh" or "h" for major and minor headings, # "q0", "q1", "q2", etc. for each group of # questions and "r0", "r1", etc. for the answers x = -1 last_heading = None current_sp_id = None index = 0 for i in range(0,len(self.all_stuff)): if i > 0: previous = self.all_stuff[i-1] else: previous = None if i < (len(self.all_stuff) - 1): next = self.all_stuff[i+1] else: next = None a = self.all_stuff[i] self.ofp.write('\n\n') if a.__class__ == Heading: last_was_answer = True if a.major: subtype = "mh" else: subtype = "h" if next and next.__class__ == QuestionOrReply and next.sp_id: # Then use the question's sp_id: self.ofp.write(a.to_xml(self.get_id(next.sp_id,subtype))) else: x += 1 self.ofp.write(a.to_xml(self.get_id(str(x),subtype))) last_heading = a elif a.__class__ == QuestionOrReply: # Occasionally we think questions are actually # answers, so check the beginning of the first # paragraph: if not a.is_question and len(a.paragraphs) > 0 and re.search('^(?ims)\s*To\s+ask',a.paragraphs[0]): a.is_question = True # If we're suddenly in an answer, reset index. if (not a.is_question) and previous and not (previous.__class__ == QuestionOrReply and not previous.is_question): index = 0 # If we're suddenly in a question, reset index and increment x unless the previous is a heading elif a.is_question: if previous: if previous.__class__ == QuestionOrReply: if previous.is_question: # If the one before is a question, that's fine. current_sp_id = a.sp_id else: current_sp_id = a.sp_id # If the previous one was an answer # then we need to replay the last # heading: if not last_heading: raise Exception, "Somehow there's been no heading so far." 
last_heading.sp_name = a.sp_name if current_sp_id: self.ofp.write(last_heading.to_xml(self.get_id(current_sp_id,"h"))) else: x += 1 self.ofp.write(last_heading.to_xml(self.get_id(str(x),"h"))) self.ofp.write("\n\n") index = 0 else: # i.e. this is the normal case, a question after a heading: current_sp_id = a.sp_id index = 0 else: raise Exception, "Nothing before the first question (no heading)" if a.is_question: subtype = "q" + str(index) else: subtype = "r" + str(index) if current_sp_id: self.ofp.write(a.to_xml(self.get_id(current_sp_id,subtype))) else: self.ofp.write(a.to_xml(self.get_id(str(x),subtype))) index += 1 self.ofp.write("</publicwhip>") self.ofp.close() retcode = call( [ "mv", temp_output_filename, output_filename ] ) if retcode != 0: raise Exception, "Moving "+temp_output_filename+" to "+output_filename+" failed." xmlvalidate.parse(output_filename) #retcode = call( [ "xmlstarlet", "val", output_filename ] ) #if retcode != 0: # raise Exception, "Validating "+output_filename+" for well-formedness failed." fil = open('%schangedates.txt' % xml_output_directory, 'a+') fil.write('%d,spwa%s.xml\n' % (time.time(), self.date)) fil.close()
def find_holding_answer_issued(self, s):
    """Record any 'Holding answer issued' date found in s.

    On a match the date is stored with set_date_holding_answer_was_issued
    and s minus the matched phrase is returned; otherwise returns None.
    """
    match = re.match(
        '(?ims)^(.*)Holding answer issued: (\d+) (\w+) (\d+)(.*)$', s)
    if match:
        # Groups: 2=day, 3=month name, 4=year.
        issued = datetime.date(int(match.group(4), 10),
                               month_name_to_int(match.group(3)),
                               int(match.group(2), 10))
        self.set_date_holding_answer_was_issued(issued)
        return match.group(1) + match.group(5)
# NOTE(review): this line is a whitespace-mangled FRAGMENT cut at both
# edges: it starts mid if/elif chain ('continue elif m_day:') inside an
# unseen loop that classifies pages into daily_pages vs contents_pages,
# and it ends on a dangling 'if not os.path.exists(...):' header whose
# body is missing.  Because the newlines were stripped, the inline '#'
# comments now swallow the rest of the physical line, so the code is
# left byte-identical rather than guessed at.  Presumably m_day/m_week
# are regex matches on page links - TODO confirm against the full file.
continue elif m_day: # print "Got day: "+s daily_pages.add( (subdir,leaf) ) elif m_week: day_start = m_week.group(1) month_start = m_week.group(2) year_start = m_week.group(3) day_end = m_week.group(4) month_end = m_week.group(5) year_end = m_week.group(6) if not month_start: month_start = month_end if not year_start: year_start = year_end start_date = datetime.date( int(year_start), month_name_to_int(month_start), int(day_start,10) ) end_date = datetime.date( int(year_end), month_name_to_int(month_end), int(day_end,10) ) contents_pages.add( (subdir,leaf,start_date,end_date) ) contents_hash[subdir+"_"+leaf] = True # Fetch all the contents pages: for (subdir,leaf,start_date,end_date) in contents_pages: contents_filename = output_directory + "contents-"+subdir+"_"+leaf contents_url = written_answers_prefix + subdir + "/" + leaf # Fetch the contents page if we don't already have it, or if # it was the last one fetched: if not os.path.exists(contents_filename) or (len(existing_contents_pages) > 0 and existing_contents_pages[-1] == contents_filename):
def parse(self, filename): m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))', filename) if not m: raise Exception, "Couldn't parse filename: " + filename self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % ( m.group(2), m.group(3)) filename_leaf = m.group(1) # We need to know what date this is, so deal with that first # of all in a brutish fashion, but cache the results: self.date = None if file_to_date.has_key(filename_leaf): if verbose: print "Found file to date mapping in cache." self.date = datetime.date( *strptime(file_to_date[filename_leaf], "%Y-%m-%d")[0:3]) else: self.make_soup(filename) page_as_text = tidy_string(non_tag_data_in(self.soup.find('body'))) m = re.search( '(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?', page_as_text) if m: day_of_week = m.group(1) day = m.group(2) month = month_name_to_int(m.group(3)) year = m.group(4) # Sometimes the date string doesn't have the year: if not year: m = re.search('day-wa-(\d\d)', filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) self.date = datetime.date(int(year, 10), month, int(day, 10)) if not options.quiet: "Adding file to date mapping to cache." 
add_file_to_date_mapping(filename_leaf, str(self.date)) else: raise Exception, "No date found in file: " + filename temp_output_filename = xml_output_directory + "tmp.xml" output_filename = xml_output_directory + "spwa" + str( self.date) + ".xml" if os.path.exists(output_filename): #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename) # raise Exception, error #if not options.quiet: print error return if not options.quiet: print "Parsing %s" % filename self.make_soup(filename) self.ofp = open(temp_output_filename, "w") self.ofp.write('''<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE publicwhip [ <!ENTITY pound "£"> <!ENTITY euro "€"> <!ENTITY agrave "à"> <!ENTITY aacute "á"> <!ENTITY egrave "è"> <!ENTITY eacute "é"> <!ENTITY ecirc "ê"> <!ENTITY iacute "í"> <!ENTITY ograve "ò"> <!ENTITY oacute "ó"> <!ENTITY uacute "ú"> <!ENTITY Aacute "Á"> <!ENTITY Eacute "É"> <!ENTITY Iacute "Í"> <!ENTITY Oacute "Ó"> <!ENTITY Uacute "Ú"> <!ENTITY Uuml "Ü"> <!ENTITY auml "ä"> <!ENTITY euml "ë"> <!ENTITY iuml "ï"> <!ENTITY ouml "ö"> <!ENTITY uuml "ü"> <!ENTITY fnof "ƒ"> <!ENTITY aelig "æ"> <!ENTITY dagger "†"> <!ENTITY reg "®"> <!ENTITY nbsp " "> <!ENTITY shy "­"> <!ENTITY deg "°"> <!ENTITY middot "·"> <!ENTITY ordm "º"> <!ENTITY ndash "–"> <!ENTITY mdash "—"> <!ENTITY lsquo "‘"> <!ENTITY rsquo "’"> <!ENTITY ldquo "“"> <!ENTITY rdquo "”"> <!ENTITY hellip "…"> <!ENTITY bull "•"> <!ENTITY acirc "â"> <!ENTITY Agrave "À"> <!ENTITY Aring "Å"> <!ENTITY aring "å"> <!ENTITY atilde "ã"> <!ENTITY Ccedil "Ç"> <!ENTITY ccedil "ç"> <!ENTITY Egrave "È"> <!ENTITY Icirc "Î"> <!ENTITY icirc "î"> <!ENTITY Igrave "Ì"> <!ENTITY igrave "ì"> <!ENTITY ntilde "ñ"> <!ENTITY ocirc "ô"> <!ENTITY oelig "œ"> <!ENTITY Ograve "Ò"> <!ENTITY Oslash "Ø"> <!ENTITY oslash "ø"> <!ENTITY Scaron "Š"> <!ENTITY scaron "š"> <!ENTITY sup1 "¹"> <!ENTITY sup2 "²"> <!ENTITY sup3 "³"> <!ENTITY ugrave "ù"> <!ENTITY ucirc "û"> <!ENTITY Ugrave "Ù"> <!ENTITY yacute "ý"> <!ENTITY frac12 
"½"> <!ENTITY micro "µ"> <!ENTITY sbquo "‚"> <!ENTITY trade "™"> <!ENTITY Dagger "‡"> <!ENTITY radic "√"> ]> <publicwhip> ''') self.ofp.write("<source url=\"%s\"/>" % self.original_url) tag_with_most_paragraphs = None most_paragraphs_so_far = -1 for t in self.soup.findAll(True): ps = paragraphs_in_tag(t) if ps > most_paragraphs_so_far: tag_with_most_paragraphs = t most_paragraphs_so_far = ps if verbose: print "Using element name: " + tag_with_most_paragraphs.name + " with " + str( most_paragraphs_so_far) + " paragraphs from " + filename if verbose: print tag_with_most_paragraphs.prettify() # When we're parsing we might have multiple questions in a # row. We say that something's a question rather than an # answer if (a) it's followed by an ID or (b) it begins with # "To ask", otherwise it's an answer. If we hit a new # heading, that suggests that the previous thing was an answer # as well. # The business of "Holding answers" is a bit confusing. At # the bottom of each page there may be a list of question IDs # which were given holding answers, but the text of the # question is not in the page - you only find it when the # question is eventually answered. for t in tag_with_most_paragraphs: if t.__class__ == NavigableString: s = str(t) s = re.sub('(?ims)\s+', ' ', s) if re.match('(?ims)^\s*$', s): continue else: self.add_to_paragraph(tidy_string(str(t))) if verbose: print "string: " + str(s) elif t.__class__ == Tag: # Look for any <a name=""> tags in here: a = t.find(lambda p: p.name == 'a' and p.has_key('name')) if a: self.sp_name = a['name'] if t.has_key('align') and t['align'].lower() == 'right': # Right aligned tags just have the question ID. 
if self.find_id_and_possible_holding_date(t): self.complete_question() else: if verbose: print "Couldn't parse top-level right aligned tag: " + str( t) elif t.has_key('class') and t['class'] == 'largeHeading': self.add_large_heading(tidy_string(non_tag_data_in(t))) elif self.something_centered(t) or self.c1_heading(t): # Centred tags are headings for questions... s = tidy_string(non_tag_data_in(t)) if len(s) > 0: self.complete_answer() if verbose: print "center: " + s self.add_heading(s) elif t.name == 'table': # This is probably a table that's inserted just to # right align the question ID. The left cell may # contain something to indicate that it's a # holding answer. if self.find_id_and_possible_holding_date(t): # Then also look for the "Holding answer # issued" details... s = non_tag_data_in(t) self.find_holding_answer_issued(s) self.complete_question() else: # Then maybe it's a table as part of the # answer, so add it as a paragraph. self.add_paragraph(str(t)) elif t.name == 'p': if re.search( "(The following questions were given holding answers|Questions given holding answers)", tidy_string(non_tag_data_in(t))): if verbose: print "Found the trailing holding question list!" # This indicates the end of the day's report # for us (just ignore the following list of # answers - it's not very interesting until we # parse some later day and we can tell what # the question was...) break if verbose: print "Didn't find the trailing holding question list in: " + non_tag_data_in( t) non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re. 
match('^\s*$', x), t.contents) if len(non_empty_contents) == 0: continue initial_strong_text = '' while len(non_empty_contents) > 0 and non_empty_contents[ 0].__class__ == Tag and ( non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'): initial_strong_text += " " + non_tag_data_in( non_empty_contents[0]) non_empty_contents = non_empty_contents[1:] if len(initial_strong_text) > 0: speaker_name = tidy_string(initial_strong_text) # In some files this will be the ID (possibly # plus holding indication), not right aligned # as usual :( if self.find_id_and_possible_holding_date( speaker_name): self.complete_question() else: speaker_name = re.sub('(?ims)\s*:\s*$', '', speaker_name) person_id = self.valid_speaker(speaker_name) if speaker_name and person_id: self.complete_answer() self.set_speaker(speaker_name, person_id) for e in non_empty_contents: s = tidy_string(str(e)) self.add_to_paragraph(s) else: self.add_paragraph_removing_enclosure(t) else: self.add_paragraph_removing_enclosure(t) elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center': # Just add them in a paragraph anyway, even though # that wouldn't be valid HTML 4 strict in the case # of the last three (IIRC) self.add_paragraph(str(t)) else: # Well, if it's empty of text we don't care... s = non_tag_data_in(t) if not re.match('(?ims)^\s*$', s): raise Exception, "Unknown tag found of name '" + t.name + "' with text: " + t.prettify( ) self.complete_answer() # Now output all the XML, working out IDs for each element. # IDs are of the form: # # uk.org.publicwhip/spwa/YYYY-MM-DD.X.T # # .... where: # - YYYY-MM-DD is an ISO 8601 date # # - X is a integer starting at 0 on each day, which # should be incremented for each new heading and # be the same for a group of questions and their # answer. # # - T is "mh" or "h" for major and minor headings, # "q0", "q1", "q2", etc. for each group of # questions and "r0", "r1", etc. 
for the answers x = -1 last_heading = None current_sp_id = None index = 0 for i in range(0, len(self.all_stuff)): if i > 0: previous = self.all_stuff[i - 1] else: previous = None if i < (len(self.all_stuff) - 1): next = self.all_stuff[i + 1] else: next = None a = self.all_stuff[i] self.ofp.write('\n\n') if a.__class__ == Heading: last_was_answer = True if a.major: subtype = "mh" else: subtype = "h" if next and next.__class__ == QuestionOrReply and next.sp_id: # Then use the question's sp_id: self.ofp.write(a.to_xml(self.get_id(next.sp_id, subtype))) else: x += 1 self.ofp.write(a.to_xml(self.get_id(str(x), subtype))) last_heading = a elif a.__class__ == QuestionOrReply: # Occasionally we think questions are actually # answers, so check the beginning of the first # paragraph: if not a.is_question and len(a.paragraphs) > 0 and re.search( '^(?ims)\s*To\s+ask', a.paragraphs[0]): a.is_question = True # If we're suddenly in an answer, reset index. if (not a.is_question) and previous and not ( previous.__class__ == QuestionOrReply and not previous.is_question): index = 0 # If we're suddenly in a question, reset index and increment x unless the previous is a heading elif a.is_question: if previous: if previous.__class__ == QuestionOrReply: if previous.is_question: # If the one before is a question, that's fine. current_sp_id = a.sp_id else: current_sp_id = a.sp_id # If the previous one was an answer # then we need to replay the last # heading: if not last_heading: raise Exception, "Somehow there's been no heading so far." last_heading.sp_name = a.sp_name if current_sp_id: self.ofp.write( last_heading.to_xml( self.get_id(current_sp_id, "h"))) else: x += 1 self.ofp.write( last_heading.to_xml( self.get_id(str(x), "h"))) self.ofp.write("\n\n") index = 0 else: # i.e. 
this is the normal case, a question after a heading: current_sp_id = a.sp_id index = 0 else: raise Exception, "Nothing before the first question (no heading)" if a.is_question: subtype = "q" + str(index) else: subtype = "r" + str(index) if current_sp_id: self.ofp.write( a.to_xml(self.get_id(current_sp_id, subtype))) else: self.ofp.write(a.to_xml(self.get_id(str(x), subtype))) index += 1 self.ofp.write("</publicwhip>") self.ofp.close() retcode = call(["mv", temp_output_filename, output_filename]) if retcode != 0: raise Exception, "Moving " + temp_output_filename + " to " + output_filename + " failed." xmlvalidate.parse(output_filename) #retcode = call( [ "xmlstarlet", "val", output_filename ] ) #if retcode != 0: # raise Exception, "Validating "+output_filename+" for well-formedness failed." fil = open('%schangedates.txt' % xml_output_directory, 'a+') fil.write('%d,spwa%s.xml\n' % (time.time(), self.date)) fil.close()
# NOTE(review): whitespace-mangled FRAGMENT of a larger function.  It
# reads the page body via MinimalSoup, regex-scans the text for a
# 'Weekday DDth Month YYYY' date (year optional, recovered from the
# 'day-[ab]b-NN' filename when absent), and builds
# date_from_filecontents.  It ends on the dangling header
# 'if date_from_filename == date_from_filecontents:' whose body lies
# outside this chunk, so the code is left byte-identical rather than
# reconstructed.
day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: "+day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if not m: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if m: day_of_week = m.group(2) day = m.group(3) month = month_name_to_int(m.group(4)) if month == 0: print "Whole match was '" + str(m.group(0)) + "'" raise Exception, "Month name '"+m.group(4)+"' not known in file: "+day_filename else: year = m.group(6) # Sometimes the date string doesn't have the year: if not year: m = re.search('(?i)day-[ab]b-(\d\d)',day_filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) date_from_filecontents = datetime.date( int(year,10), month, int(day,10) ) if date_from_filename == date_from_filecontents:
# NOTE(review): whitespace-mangled FRAGMENT, a reformatted duplicate of
# the chunk on the previous line (same MinimalSoup-error / date-guessing
# logic).  It is cut at BOTH edges - it opens with a bare 'else:' whose
# matching 'if' is outside this chunk - so the code is left
# byte-identical rather than reconstructed.
else: error = "File couldn't be parsed by MinimalSoup: " + day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if not m: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if m: day_of_week = m.group(2) day = m.group(3) month = month_name_to_int(m.group(4)) if month == 0: print "Whole match was '" + str(m.group(0)) + "'" raise Exception, "Month name '" + m.group( 4) + "' not known in file: " + day_filename else: year = m.group(6) # Sometimes the date string doesn't have the year: if not year: m = re.search('(?i)day-[ab]b-(\d\d)', day_filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) date_from_filecontents = datetime.date(int(year, 10), month, int(day, 10))
# NOTE(review): whitespace-mangled FRAGMENT of a link-fetching loop over
# link_tags.  For hrefs matching either the old 'or-...' or the new
# 'ORSearch/ReportView.aspx...mode=html' form it extracts a
# '(Official Report)? DD Month' date from the link text, skips URLs
# already in fetched_urls_hash, and starts fetching with
# urllib.urlopen.  The loop body runs past the end of this chunk (the
# trailing nested 'if's are dangling), and the collapsed '#' comments
# swallow the rest of the physical line, so the code is left
# byte-identical rather than reconstructed.
for t in link_tags: # old format link - /or-10/sor1223-01.htm # new format link - http://www.scottish.parliament.uk/Apps2/Business/ORSearch/ReportView.aspx?r=6132&mode=html if t.has_key('href') and (re.match('^or-',t['href']) or re.search('ORSearch/ReportView.aspx.*?mode=html', t['href'])): # print t s = "" for c in t.contents: if type(c) == NavigableString: s = s + str(c) s = re.sub(',','',s) # print year_index_filename + "==> " + s d = None m = re.match( '^(Official Report)?\s*(\d+)\s+(\w+)', s ) if not m: raise Exception, "Unrecognized date format in '%s'" % s d = datetime.date( year, month_name_to_int(m.group(3)), int(m.group(2)) ) page = str(t['href']) contents_url = official_reports_prefix + page contents_last_modified = None if fetched_urls_hash.has_key(contents_url): continue output_filename = official_report_template % ( str(d), 0 ) if not os.path.exists(output_filename): if options.verbose: print "Fetching %s" % contents_url ur = urllib.urlopen(contents_url) if ur.info().has_key('last-modified'): contents_last_modified = ur.info()['last-modified']