Code Example #1
 def find_id_and_possible_holding_date(self, tag_or_string):
     if tag_or_string.__class__ == Tag:
         if verbose: print "Parsing tag: " + str(tag_or_string)
         s = tidy_string(non_tag_data_in(tag_or_string))
     else:
         if verbose: print "Parsing string: " + tag_or_string
         s = tag_or_string
     rest = self.find_holding_answer_issued(s)
     if rest:
         s = rest
     if len(s) > 0:
         m = re.search('\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)', s)
         if m:
             sp_id = m.group(1) + m.group(2)
             self.set_id(sp_id)
             return True
     return False
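
The ID extraction above relies on the regex '\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)', which captures a parenthesised Scottish Parliament question ID split into its prefix and number, tolerating an optional space after the hyphen. A minimal standalone sketch of just that step (the sample string is invented for illustration, not taken from a real page):

import re

# Hypothetical input; in the parser above the string comes from
# tidy_string(non_tag_data_in(tag_or_string)).
s = "To ask the Scottish Executive ... (S1W- 12345)"
m = re.search(r'\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)', s)
if m:
    sp_id = m.group(1) + m.group(2)
    print sp_id   # prints "S1W-12345"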
Code Example #2
 def find_id_and_possible_holding_date(self,tag_or_string):
     if tag_or_string.__class__ == Tag:
         if verbose: print "Parsing tag: "+str(tag_or_string)
         s = tidy_string(non_tag_data_in(tag_or_string))
     else:
         if verbose: print "Parsing string: "+tag_or_string
         s = tag_or_string
     rest = self.find_holding_answer_issued(s)
     if rest:
         s = rest            
     if len(s) > 0:
         m = re.search('\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)',s)
         if m:
             sp_id = m.group(1) + m.group(2)
             self.set_id(sp_id)
             return True
     return False
Code Example #3
File: parse-motions.py  Project: samknight/parlparse
 #   em- and en- dash
 html = re.sub('—','—',html)
 html = re.sub('–','–',html)
 #   left and right single quotes
 html = re.sub('’','’',html)
 html = re.sub('‘','‘',html)
 #   pound sterling signs
 html = re.sub('\xA3','£', html)
 spids = re.findall(spid_motion_re,html)
 spids_found_here = {}
 soup = BeautifulSoup(html)
 body = soup.find('body')
 if not body:
     raise Exception, "Couldn't find the body in " + filename
 for s in body.findAll('strong'):
     s_as_text = non_tag_data_in(s)
     m = re.search(spid_motion_at_start_re,s_as_text)
     if m:
         spids_found_here[m.group(1)] = s.parent
 for p in body.findAll('p'):
     s_as_text = non_tag_data_in(p)
     m = re.search(spid_motion_at_start_re,s_as_text)
     if m:
         spids_found_here[m.group(1)] = p
 for spid, element in spids_found_here.iteritems():
     if options.verbose: print "    " + spid
     motion_elements = list()
     current_element = element
     while True:
         motion_elements.append(current_element)
         current_element = current_element.nextSibling
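
The patterns spid_motion_re and spid_motion_at_start_re are defined elsewhere in parse-motions.py and are not part of this excerpt. Purely as a hedged illustration of the shape such patterns might take (these are guesses, not the project's actual definitions), motion IDs of the form "S1M-1234" could be matched with something like:

import re

# Hypothetical stand-ins for the real patterns, which are not shown here.
spid_motion_re = r'S[0-9]M-[0-9]+(?:\.[0-9]+)?'
spid_motion_at_start_re = r'^\s*(' + spid_motion_re + r')'

print re.findall(spid_motion_re, 'Motions S1M-123 and S1M-456.2 were lodged.')
# ['S1M-123', 'S1M-456.2']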
Code Example #4
    def parse(self,filename):

        m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))',filename)
        if not m:
            raise Exception, "Couldn't parse filename: "+filename
        self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (m.group(2),m.group(3))

        filename_leaf = m.group(1)

        # We need to know what date this is, so deal with that first
        # of all in a brutish fashion, but cache the results:

        self.date = None

        if file_to_date.has_key(filename_leaf):
            if verbose: print "Found file to date mapping in cache."
            self.date = datetime.date(*strptime(file_to_date[filename_leaf],"%Y-%m-%d")[0:3])
        else:
            self.make_soup(filename)            
            page_as_text = tidy_string(non_tag_data_in(self.soup.find('body')))
            m = re.search('(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',page_as_text)
            if m:
                day_of_week = m.group(1)
                day = m.group(2)
                month = month_name_to_int(m.group(3))
                year = m.group(4)
                # Sometimes the date string doesn't have the year:
                if not year:
                    m = re.search('day-wa-(\d\d)',filename)
                    if m.group(1) == '99':
                        year = '1999'
                    else:
                        year = '20' + m.group(1)
                self.date = datetime.date( int(year,10), month, int(day,10) )
                if not options.quiet: print "Adding file to date mapping to cache."
                add_file_to_date_mapping(filename_leaf,str(self.date))
            else:
                raise Exception, "No date found in file: "+filename

        temp_output_filename = xml_output_directory + "tmp.xml"
        output_filename = xml_output_directory + "spwa" + str(self.date) + ".xml"

        if os.path.exists(output_filename):
            #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename)
            # raise Exception, error
            #if not options.quiet: print error
            return

        if not options.quiet: print "Parsing %s" % filename

        self.make_soup(filename)

        self.ofp = open(temp_output_filename,"w")

        self.ofp.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE publicwhip [

<!ENTITY pound   "&#163;">
<!ENTITY euro    "&#8364;">

<!ENTITY agrave  "&#224;">
<!ENTITY aacute  "&#225;">
<!ENTITY egrave  "&#232;">
<!ENTITY eacute  "&#233;">
<!ENTITY ecirc   "&#234;">
<!ENTITY iacute  "&#237;">
<!ENTITY ograve  "&#242;">
<!ENTITY oacute  "&#243;">
<!ENTITY uacute  "&#250;">
<!ENTITY Aacute  "&#193;">
<!ENTITY Eacute  "&#201;">
<!ENTITY Iacute  "&#205;">
<!ENTITY Oacute  "&#211;">
<!ENTITY Uacute  "&#218;">
<!ENTITY Uuml    "&#220;">
<!ENTITY auml    "&#228;">
<!ENTITY euml    "&#235;">
<!ENTITY iuml    "&#239;">
<!ENTITY ouml    "&#246;">
<!ENTITY uuml    "&#252;">
<!ENTITY fnof    "&#402;">
<!ENTITY aelig   "&#230;">
<!ENTITY dagger  "&#8224;">
<!ENTITY reg     "&#174;">
<!ENTITY nbsp    "&#160;">
<!ENTITY shy     "&#173;">
<!ENTITY deg     "&#176;">
<!ENTITY middot  "&#183;">
<!ENTITY ordm    "&#186;">
<!ENTITY ndash   "&#8211;">
<!ENTITY mdash   "&#8212;">
<!ENTITY lsquo   "&#8216;">
<!ENTITY rsquo   "&#8217;">
<!ENTITY ldquo   "&#8220;">
<!ENTITY rdquo   "&#8221;">
<!ENTITY hellip  "&#8230;">
<!ENTITY bull    "&#8226;">

<!ENTITY acirc   "&#226;">
<!ENTITY Agrave  "&#192;">
<!ENTITY Aring   "&#197;">
<!ENTITY aring   "&#229;">
<!ENTITY atilde  "&#227;">
<!ENTITY Ccedil  "&#199;">
<!ENTITY ccedil  "&#231;">
<!ENTITY Egrave  "&#200;">
<!ENTITY Icirc   "&#206;">
<!ENTITY icirc   "&#238;">
<!ENTITY Igrave  "&#204;">
<!ENTITY igrave  "&#236;">
<!ENTITY ntilde  "&#241;">
<!ENTITY ocirc   "&#244;">
<!ENTITY oelig   "&#339;">
<!ENTITY Ograve  "&#210;">
<!ENTITY Oslash  "&#216;">
<!ENTITY oslash  "&#248;">
<!ENTITY Scaron  "&#352;">
<!ENTITY scaron  "&#353;">
<!ENTITY sup1    "&#185;">
<!ENTITY sup2    "&#178;">
<!ENTITY sup3    "&#179;">
<!ENTITY ugrave  "&#249;">
<!ENTITY ucirc   "&#251;">
<!ENTITY Ugrave  "&#217;">
<!ENTITY yacute  "&#253;">
<!ENTITY frac12  "&#189;">
<!ENTITY micro   "&#181;">
<!ENTITY sbquo   "&#8218;">
<!ENTITY trade   "&#8482;">
<!ENTITY Dagger  "&#8225;">
<!ENTITY radic   "&#8730;">
]>

<publicwhip>

''')

        self.ofp.write("<source url=\"%s\"/>" % self.original_url )
        
        tag_with_most_paragraphs = None
        most_paragraphs_so_far = -1
        
        for t in self.soup.findAll(True):
            ps = paragraphs_in_tag(t)
            if ps > most_paragraphs_so_far:
                tag_with_most_paragraphs = t
                most_paragraphs_so_far = ps
        
        if verbose: print "Using element name: "+tag_with_most_paragraphs.name+" with "+str(most_paragraphs_so_far)+" paragraphs from "+filename
        
        if verbose: print tag_with_most_paragraphs.prettify()
        
        # When we're parsing we might have multiple questions in a
        # row.  We say that something's a question rather than an
        # answer if (a) it's followed by an ID or (b) it begins with
        # "To ask", otherwise it's an answer.  If we hit a new
        # heading, that suggests that the previous thing was an answer
        # as well.
        
        # The business of "Holding answers" is a bit confusing.  At
        # the bottom of each page there may be a list of question IDs
        # which were given holding answers, but the text of the
        # question is not in the page - you only find it when the
        # question is eventually answered.
        
        for t in tag_with_most_paragraphs:
            if t.__class__ == NavigableString:
                s = str(t)
                s = re.sub('(?ims)\s+',' ',s)
                if re.match('(?ims)^\s*$',s):
                    continue
                else:
                    self.add_to_paragraph(tidy_string(str(t)))
                if verbose: print "string: "+str(s)            
            elif t.__class__ == Tag:
                # Look for any <a name=""> tags in here:
                a = t.find( lambda p: p.name == 'a' and p.has_key('name') )
                if a:
                    self.sp_name = a['name']
                if t.has_key('align') and t['align'].lower() == 'right':
                    # Right aligned tags just have the question ID.
                    if self.find_id_and_possible_holding_date(t):
                        self.complete_question()
                    else:
                        if verbose: print "Couldn't parse top-level right aligned tag: "+str(t)
                elif t.has_key('class') and t['class'] == 'largeHeading':
                    self.add_large_heading(tidy_string(non_tag_data_in(t)))
                elif self.something_centered(t) or self.c1_heading(t):
                    # Centred tags are headings for questions...
                    s = tidy_string(non_tag_data_in(t))
                    if len(s) > 0:
                        self.complete_answer()
                        if verbose: print "center: "+s
                        self.add_heading(s)
                elif t.name == 'table':
                    # This is probably a table that's inserted just to
                    # right align the question ID.  The left cell may
                    # contain something to indicate that it's a
                    # holding answer.
                    if self.find_id_and_possible_holding_date(t):
                        # Then also look for the "Holding answer
                        # issued" details...
                        s = non_tag_data_in(t)
                        self.find_holding_answer_issued(s)
                        self.complete_question()
                    else:
                        # Then maybe it's a table as part of the
                        # answer, so add it as a paragraph.
                        self.add_paragraph(str(t))
                elif t.name == 'p':
                    if re.search("(The following questions were given holding answers|Questions given holding answers)",tidy_string(non_tag_data_in(t))):
                        if verbose: print "Found the trailing holding question list!"
                        # This indicates the end of the day's report
                        # for us (just ignore the following list of
                        # answers - it's not very interesting until we
                        # parse some later day and we can tell what
                        # the question was...)                        
                        break
                    if verbose: print "Didn't find the trailing holding question list in: "+non_tag_data_in(t)
                    non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re.match('^\s*$',x), t.contents )
                    if len(non_empty_contents) == 0:
                        continue
                    initial_strong_text = ''
                    while len(non_empty_contents) > 0 and non_empty_contents[0].__class__ == Tag and (non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'):
                        initial_strong_text += " " + non_tag_data_in(non_empty_contents[0])
                        non_empty_contents = non_empty_contents[1:]
                    if len(initial_strong_text) > 0:
                        speaker_name = tidy_string(initial_strong_text)
                        # In some files this will be the ID (possibly
                        # plus holding indication), not right aligned
                        # as usual :(
                        if self.find_id_and_possible_holding_date(speaker_name):
                            self.complete_question()
                        else:
                            speaker_name = re.sub('(?ims)\s*:\s*$','',speaker_name)
                            speaker_id = self.valid_speaker(speaker_name)
                            if speaker_name and speaker_id:
                                self.complete_answer()
                                self.set_speaker(speaker_name,speaker_id)
                                for e in non_empty_contents:
                                    s = tidy_string(str(e))
                                    self.add_to_paragraph(s)
                            else:
                                self.add_paragraph_removing_enclosure(t)
                    else:
                        self.add_paragraph_removing_enclosure(t)
                elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center':
                    # Just add them in a paragraph anyway, even though
                    # that wouldn't be valid HTML 4 strict in the case
                    # of the last three (IIRC)
                    self.add_paragraph(str(t))
                else:
                    # Well, if it's empty of text we don't care...
                    s = non_tag_data_in(t)
                    if not re.match('(?ims)^\s*$',s):
                        raise Exception, "Unknown tag found of name '"+t.name+"' with text: "+t.prettify()
        self.complete_answer()

        # Now output all the XML, working out IDs for each element.
        # IDs are of the form:
        # 
        #   uk.org.publicwhip/spwa/YYYY-MM-DD.X.T
        # 
        #     .... where:
        #            - YYYY-MM-DD is an ISO 8601 date
        # 
        #            - X is an integer starting at 0 on each day, which
        #              should be incremented for each new heading and
        #              be the same for a group of questions and their
        #              answer.
        #
        #            - T is "mh" or "h" for major and minor headings,
        #             "q0", "q1", "q2", etc. for each group of
        #             questions and "r0", "r1", etc. for the answers

        x = -1
        last_heading = None
        current_sp_id = None

        index = 0

        for i in range(0,len(self.all_stuff)):

            if i > 0:
                previous = self.all_stuff[i-1]
            else:
                previous = None

            if i < (len(self.all_stuff) - 1):
                next = self.all_stuff[i+1]
            else:
                next = None
                
            a = self.all_stuff[i]

            self.ofp.write('\n\n')

            if a.__class__ == Heading:
                last_was_answer = True
                if a.major:
                    subtype = "mh"
                else:
                    subtype = "h"
                if next and next.__class__ == QuestionOrReply and next.sp_id:
                    # Then use the question's sp_id:
                    self.ofp.write(a.to_xml(self.get_id(next.sp_id,subtype)))
                else:
                    x += 1
                    self.ofp.write(a.to_xml(self.get_id(str(x),subtype)))
                last_heading = a
            elif a.__class__ == QuestionOrReply:
                # Occasionally we think questions are actually
                # answers, so check the beginning of the first
                # paragraph:
                if not a.is_question and len(a.paragraphs) > 0 and re.search('^(?ims)\s*To\s+ask',a.paragraphs[0]):
                    a.is_question = True
                # If we're suddenly in an answer, reset index.
                if (not a.is_question) and previous and not (previous.__class__ == QuestionOrReply and not previous.is_question):
                    index = 0
                # If we're suddenly in a question, reset index and increment x unless the previous is a heading
                elif a.is_question:
                    if previous:
                        if previous.__class__ == QuestionOrReply:
                            if previous.is_question:
                                # If the one before is a question, that's fine.
                                current_sp_id = a.sp_id
                            else:
                                current_sp_id = a.sp_id
                                # If the previous one was an answer
                                # then we need to replay the last
                                # heading:
                                if not last_heading:
                                    raise Exception, "Somehow there's been no heading so far."
                                last_heading.sp_name = a.sp_name
                                if current_sp_id:
                                    self.ofp.write(last_heading.to_xml(self.get_id(current_sp_id,"h")))
                                else:
                                    x += 1
                                    self.ofp.write(last_heading.to_xml(self.get_id(str(x),"h")))
                                self.ofp.write("\n\n")
                                index = 0
                        else:
                            # i.e. this is the normal case, a question after a heading:
                            current_sp_id = a.sp_id
                            index = 0
                    else:
                        raise Exception, "Nothing before the first question (no heading)"
                if a.is_question:
                    subtype = "q" + str(index)
                else:
                    subtype = "r" + str(index)
                if current_sp_id:
                    self.ofp.write(a.to_xml(self.get_id(current_sp_id,subtype)))
                else:
                    self.ofp.write(a.to_xml(self.get_id(str(x),subtype)))
                index += 1

        self.ofp.write("</publicwhip>")
        self.ofp.close()

        retcode = call( [ "mv", temp_output_filename, output_filename ] )
        if retcode != 0:
            raise Exception, "Moving "+temp_output_filename+" to "+output_filename+" failed."

        xmlvalidate.parse(output_filename)
        #retcode = call( [ "xmlstarlet", "val", output_filename ] )
        #if retcode != 0:
        #    raise Exception, "Validating "+output_filename+" for well-formedness failed."

        fil = open('%schangedates.txt' % xml_output_directory, 'a+')
        fil.write('%d,spwa%s.xml\n' % (time.time(), self.date))
        fil.close()
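
The comment block above documents the ID scheme (uk.org.publicwhip/spwa/YYYY-MM-DD.X.T), but the get_id helper itself is not part of this excerpt. A minimal sketch of a builder that follows the documented scheme (an illustration only, not the project's actual get_id):

# Illustration only: builds IDs of the documented form
#   uk.org.publicwhip/spwa/YYYY-MM-DD.X.T
def make_spwa_id(date, x, subtype):
    return "uk.org.publicwhip/spwa/%s.%s.%s" % (date, x, subtype)

# e.g. make_spwa_id('1999-11-09', 0, 'q0')
#   -> 'uk.org.publicwhip/spwa/1999-11-09.0.q0'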
Code Example #5
def parse_html(session, report_date, soup, page_id, original_url):
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div',
                                                           recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception, 'We only expect one <div> child of <div id="ReportView">; there were %d in page with ID %d' % (
            len(div_children_of_report_view), page_id)

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]

    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page:

    contents_div, text_div = top_level_divs

    # Just check that my assumption that the first div only contains
    # links is correct:

    contents_tuples = []

    contents_links = contents_div.findAll(True)
    for link in contents_links:
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception, "There was something other than a <br> or an <a> in the supposed contents <div>, for page ID: %d" % (
                page_id, )
        href = link['href']
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception, "Failed to find the ID from '%s' in page with ID: %d" % (
                href, page_id)
        contents_tuples.append(
            (m.group(1), tidy_string(non_tag_data_in(link))))

    parsed_page = ParsedPage(session, report_date, page_id)

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not len(unicode(top_level).strip()):
            continue
        if top_level.name == 'h2':
            section_title = tidy_string(
                non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception, "There was an empty section title in page ID: %d" % (
                    page_id)
            parsed_page.sections.append(Section(section_title, current_url))
        elif top_level.name in ('br', ):
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            try:
                current_url = original_url + "#" + top_level['id']
            except KeyError:
                pass
        elif top_level.name == 'div':
            # This div contains a speech, essentially:

            # The new style pages wrap speeches in <p>/<span> tags that we can
            # ignore, so remove them from the tree.  Occasionally there are
            # multiple spans in a <p>, hence the for loop.
            # This does mean we lose some formatting information, but because
            # it's hard-coded style attributes in the spans, it's arguable that
            # we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and speech_part.name != None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            speaker_name = re.sub(r'^\d+\.\s*', '',
                                                  speaker_name)
                            removed_number = match.group(0)
                        # If there's a trailing colon, remove that (and any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date, current_time,
                                                current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(
                            current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception, "Unexpected tag '%s' in page ID: %d" % (
                            speech_part.name, page_id)
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    # print "tidied_paragraph is", tidied_paragraph.encode('utf-8'), "of type", type(tidied_paragraph)
                    division_way, division_candidate, division_candidate_id = is_division_way(
                        tidied_paragraph, report_date)
                    member_vote = is_member_vote(
                        tidied_paragraph,
                        report_date,
                        expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, suspension_time_type, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time_type = suspension_time = None
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None, report_date,
                                                    current_time, current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (current_votes.candidate !=
                                                   division_candidate):
                            current_votes = Division(report_date, current_url,
                                                     divnumber,
                                                     division_candidate,
                                                     division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception, "Got a member's vote before an indication of which way the vote is"
                        current_votes.add_vote(current_division_way,
                                               tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        if current_votes:
                            current_votes = None
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if len(current_speech.paragraphs) == 0:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception, "Totally unparsed element:\n%s\n... unhandled in page ID: %d" % (
                        speech_part, page_id)

        else:
            raise Exception, "There was an unhandled element '%s' in page with ID: %d" % (
                top_level.name, page_id)

    return parsed_page
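
The ParsedPage, Section, Speech and Division classes used by parse_html are defined elsewhere in the project. The sketch below is a set of minimal stand-in containers consistent with how they are called above; the attribute and method names are taken from this excerpt, everything else is an assumption:

# Minimal stand-ins inferred from the calls in parse_html above; the real
# classes in the project will have more behaviour than this.
class ParsedPage(object):
    def __init__(self, session, report_date, page_id):
        self.session = session
        self.report_date = report_date
        self.page_id = page_id
        self.sections = []

class Section(object):
    def __init__(self, title, url):
        self.title = title
        self.url = url
        self.speeches_and_votes = []

class Speech(object):
    speakers_so_far = []

    @classmethod
    def reset_speakers_so_far(cls):
        cls.speakers_so_far = []

    def __init__(self, speaker_name, report_date, last_time, url):
        self.speaker_name = speaker_name
        self.report_date = report_date
        self.last_time = last_time
        self.url = url
        self.paragraphs = []

class Division(object):
    def __init__(self, report_date, url, divnumber, candidate, candidate_id):
        self.report_date = report_date
        self.url = url
        self.divnumber = divnumber
        self.candidate = candidate
        self.candidate_id = candidate_id
        self.votes = {}

    def add_vote(self, way, text, voter_id):
        self.votes.setdefault(way, []).append((text, voter_id))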
Code Example #6
def is_division_way(element, report_date=None):
    """If it's a division heading, return a normalized version, otherwise None

    >>> is_division_way('  For ')
    ('FOR', None, None)
    >>> is_division_way('nonsense')
    (None, None, None)
    >>> is_division_way('abstentions ')
    ('ABSTENTIONS', None, None)
    >>> is_division_way(":\xA0FOR")
    ('FOR', None, None)
    >>> is_division_way('Abstention')
    ('ABSTENTIONS', None, None)
    >>> is_division_way('Absentions')
    ('ABSTENTIONS', None, None)
    >>> example_date = datetime.date(1999, 5, 13)
    >>> is_division_way('VOTES FOR DONALD DEWAR', example_date)
    ('FOR', 'Donald Dewar', u'uk.org.publicwhip/member/80147')
    >>> is_division_way('now cast your votes for someone', example_date)
    (None, None, None)
    >>> example_date = datetime.date(2000, 3, 14)
    >>> is_division_way('For Mr Kenneth Macintosh', example_date)
    ('FOR', 'Mr Kenneth Macintosh', u'uk.org.publicwhip/member/80191')
    >>> is_division_way('For option 1', example_date)
    ('FOR', 'Option 1', None)
    >>> is_division_way('The following member took the oath:')
    ('FOR', 'oath', None)
    >>> is_division_way('The following member made a solemn affirmation:')
    ('FOR', 'affirmation', None)
    >>> is_division_way('The following member made a solemn affirmation and repeated it in French:')
    ('FOR', 'affirmation', None)
    """

    tidied = tidy_string(non_tag_data_in(element)).upper()
    # Strip any non-word letters at the start and end:
    tidied = re.sub(r'^\W*(.*?)\W*$', '\\1', tidied)

    if tidied in DIVISION_HEADINGS:
        return (tidied, None, None)
    elif tidied in ('ABSTENTION', 'ABSENTIONS'):
        return ('ABSTENTIONS', None, None)
    elif re.search(
            '^THE FOLLOWING MEMBERS? TOOK THE OATH( AND REPEATED IT IN .*)?:?$',
            tidied):
        return ('FOR', 'oath', None)
    elif re.search(
            '^THE FOLLOWING MEMBERS? MADE A SOLEMN AFFIRMATION( AND REPEATED IT IN .*)?:?$',
            tidied):
        return ('FOR', 'affirmation', None)
    elif len(tidied.split()) < 128:
        # The second regular expression could be *very* slow on
        # strings that begin 'FOR', so only try it on short strings
        # that might be introducing a division, and assume that there
        # are 2 to 4 words in the name:
        m1 = re.search(r'^(?i)VOTES? FOR ([A-Z ]+)$', tidied)
        m2 = re.search(r'^FOR ((?:[A-Z]+\s*){2,4})$', tidied)
        m = m1 or m2
        if m:
            person_name = m.group(1).title()
            person_id = None
            if report_date:
                person_id = get_unique_person_id(person_name, report_date)
            return ('FOR', person_name, person_id)
        else:
            m = re.search(r'FOR OPTION (\d+)$', tidied)
            if m:
                return ('FOR', 'Option ' + m.group(1), None)
    return (None, None, None)
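
DIVISION_HEADINGS and get_unique_person_id are defined elsewhere in the module. From the doctests above, DIVISION_HEADINGS must contain at least 'FOR' and 'ABSTENTIONS' (and presumably 'AGAINST'), along the lines of the sketch below; the docstring examples can also be run with the standard doctest runner, substituting the real module filename:

# Assumed shape only - the real constant lives elsewhere in the module:
DIVISION_HEADINGS = ('FOR', 'AGAINST', 'ABSTENTIONS')

# Running the embedded doctests (replace the filename with the actual module):
#   python -m doctest your_module.py -v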
Code Example #7
def is_member_vote(element, vote_date, expecting_a_vote=True):
    """Returns a speaker ID if this looks like a member's vote in a division

    Otherwise returns None.  If it looks like a vote, but the speaker
    can't be identified, this throws an exception.  As an example:

    >>> is_member_vote('Something random...', '2012-11-12')
    >>> is_member_vote('Baillie, Jackie (Dumbarton) (Lab)', '2012-11-12')
    u'uk.org.publicwhip/member/80476'
    >>> is_member_vote('Alexander, Ms Wendy (Paisley North) (Lab)', '2010-05-12')
    u'uk.org.publicwhip/member/80281'
    >>> is_member_vote('Purvis, Jeremy (Tweeddale, Ettrick and Lauderdale)', '2005-05-18')
    u'uk.org.publicwhip/member/80101'

    Now some examples that should be ignored:

    >>> is_member_vote(': SP 440 (EC Ref No 11766/99, COM(99) 473 final)', '1999-11-23')
    >>> is_member_vote('SP 666 (EC Ref No 566 99/0225, COM(99) (CNS))', '2000-02-08')
    >>> is_member_vote('to promote a private bill, the company relied on its general power under section 10(1)(xxxii)', '2006-05-22')

    And one that should throw an exception:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12')
    Traceback (most recent call last):
      ...
    Exception: A voting member 'Jeffrey Lebowski (Los Angeles)' couldn't be resolved

    If expecting_a_vote is False, then don't throw an exception if
    the name can't be resolved:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12', expecting_a_vote=False)

    Also try resolving names that aren't comma-reversed:

    >>> is_member_vote('Brian Adam (North-East Scotland) (SNP)', '1999-11-09')
    u'uk.org.publicwhip/member/80129'

    """
    tidied = tidy_string(non_tag_data_in(element))

    from_first_and_last = lambda m: m and "%s %s (%s)" % (m.group(
        'first_names'), m.group('last_name'), m.group('constituency'))

    from_full = lambda m: m and m.group('full_name')
    vote_matches = ((member_vote_re, from_first_and_last),
                    (member_vote_just_constituency_re, from_first_and_last),
                    (member_vote_fullname_re, from_full))

    reformed_name = first(
        processor(regexp.search(tidied)) for regexp, processor in vote_matches)

    if not reformed_name:
        return None

    person_id = get_unique_person_id(reformed_name, str(vote_date))

    if person_id is None and expecting_a_vote:
        print "reformed_name is:", reformed_name
        print "vote_date is:", vote_date
        raise Exception, "A voting member '%s' couldn't be resolved" % (
            reformed_name, )
    else:
        return person_id
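
The first() helper and the member_vote_* regular expressions are defined elsewhere in the module. Judging from the way first() is used above - it is given a generator whose values are either a reformatted name or None/False - it returns the first truthy item. A minimal stand-in (an assumption, not the project's actual helper):

# Minimal stand-in for first(), inferred from its use above: return the first
# truthy value from an iterable, or None if there isn't one.
def first(iterable):
    for item in iterable:
        if item:
            return item
    return None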
Code Example #8
    def parse(self, filename):

        m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))', filename)
        if not m:
            raise Exception, "Couldn't parse filename: " + filename
        self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (
            m.group(2), m.group(3))

        filename_leaf = m.group(1)

        # We need to know what date this is, so deal with that first
        # of all in a brutish fashion, but cache the results:

        self.date = None

        if file_to_date.has_key(filename_leaf):
            if verbose: print "Found file to date mapping in cache."
            self.date = datetime.date(
                *strptime(file_to_date[filename_leaf], "%Y-%m-%d")[0:3])
        else:
            self.make_soup(filename)
            page_as_text = tidy_string(non_tag_data_in(self.soup.find('body')))
            m = re.search(
                '(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',
                page_as_text)
            if m:
                day_of_week = m.group(1)
                day = m.group(2)
                month = month_name_to_int(m.group(3))
                year = m.group(4)
                # Sometimes the date string doesn't have the year:
                if not year:
                    m = re.search('day-wa-(\d\d)', filename)
                    if m.group(1) == '99':
                        year = '1999'
                    else:
                        year = '20' + m.group(1)
                self.date = datetime.date(int(year, 10), month, int(day, 10))
                if not options.quiet: print "Adding file to date mapping to cache."
                add_file_to_date_mapping(filename_leaf, str(self.date))
            else:
                raise Exception, "No date found in file: " + filename

        temp_output_filename = xml_output_directory + "tmp.xml"
        output_filename = xml_output_directory + "spwa" + str(
            self.date) + ".xml"

        if os.path.exists(output_filename):
            #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename)
            # raise Exception, error
            #if not options.quiet: print error
            return

        if not options.quiet: print "Parsing %s" % filename

        self.make_soup(filename)

        self.ofp = open(temp_output_filename, "w")

        self.ofp.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE publicwhip [

<!ENTITY pound   "&#163;">
<!ENTITY euro    "&#8364;">

<!ENTITY agrave  "&#224;">
<!ENTITY aacute  "&#225;">
<!ENTITY egrave  "&#232;">
<!ENTITY eacute  "&#233;">
<!ENTITY ecirc   "&#234;">
<!ENTITY iacute  "&#237;">
<!ENTITY ograve  "&#242;">
<!ENTITY oacute  "&#243;">
<!ENTITY uacute  "&#250;">
<!ENTITY Aacute  "&#193;">
<!ENTITY Eacute  "&#201;">
<!ENTITY Iacute  "&#205;">
<!ENTITY Oacute  "&#211;">
<!ENTITY Uacute  "&#218;">
<!ENTITY Uuml    "&#220;">
<!ENTITY auml    "&#228;">
<!ENTITY euml    "&#235;">
<!ENTITY iuml    "&#239;">
<!ENTITY ouml    "&#246;">
<!ENTITY uuml    "&#252;">
<!ENTITY fnof    "&#402;">
<!ENTITY aelig   "&#230;">
<!ENTITY dagger  "&#8224;">
<!ENTITY reg     "&#174;">
<!ENTITY nbsp    "&#160;">
<!ENTITY shy     "&#173;">
<!ENTITY deg     "&#176;">
<!ENTITY middot  "&#183;">
<!ENTITY ordm    "&#186;">
<!ENTITY ndash   "&#8211;">
<!ENTITY mdash   "&#8212;">
<!ENTITY lsquo   "&#8216;">
<!ENTITY rsquo   "&#8217;">
<!ENTITY ldquo   "&#8220;">
<!ENTITY rdquo   "&#8221;">
<!ENTITY hellip  "&#8230;">
<!ENTITY bull    "&#8226;">

<!ENTITY acirc   "&#226;">
<!ENTITY Agrave  "&#192;">
<!ENTITY Aring   "&#197;">
<!ENTITY aring   "&#229;">
<!ENTITY atilde  "&#227;">
<!ENTITY Ccedil  "&#199;">
<!ENTITY ccedil  "&#231;">
<!ENTITY Egrave  "&#200;">
<!ENTITY Icirc   "&#206;">
<!ENTITY icirc   "&#238;">
<!ENTITY Igrave  "&#204;">
<!ENTITY igrave  "&#236;">
<!ENTITY ntilde  "&#241;">
<!ENTITY ocirc   "&#244;">
<!ENTITY oelig   "&#339;">
<!ENTITY Ograve  "&#210;">
<!ENTITY Oslash  "&#216;">
<!ENTITY oslash  "&#248;">
<!ENTITY Scaron  "&#352;">
<!ENTITY scaron  "&#353;">
<!ENTITY sup1    "&#185;">
<!ENTITY sup2    "&#178;">
<!ENTITY sup3    "&#179;">
<!ENTITY ugrave  "&#249;">
<!ENTITY ucirc   "&#251;">
<!ENTITY Ugrave  "&#217;">
<!ENTITY yacute  "&#253;">
<!ENTITY frac12  "&#189;">
<!ENTITY micro   "&#181;">
<!ENTITY sbquo   "&#8218;">
<!ENTITY trade   "&#8482;">
<!ENTITY Dagger  "&#8225;">
<!ENTITY radic   "&#8730;">
]>

<publicwhip>

''')

        self.ofp.write("<source url=\"%s\"/>" % self.original_url)

        tag_with_most_paragraphs = None
        most_paragraphs_so_far = -1

        for t in self.soup.findAll(True):
            ps = paragraphs_in_tag(t)
            if ps > most_paragraphs_so_far:
                tag_with_most_paragraphs = t
                most_paragraphs_so_far = ps

        if verbose:
            print "Using element name: " + tag_with_most_paragraphs.name + " with " + str(
                most_paragraphs_so_far) + " paragraphs from " + filename

        if verbose: print tag_with_most_paragraphs.prettify()

        # When we're parsing we might have multiple questions in a
        # row.  We say that something's a question rather than an
        # answer if (a) it's followed by an ID or (b) it begins with
        # "To ask", otherwise it's an answer.  If we hit a new
        # heading, that suggests that the previous thing was an answer
        # as well.

        # The business of "Holding answers" is a bit confusing.  At
        # the bottom of each page there may be a list of question IDs
        # which were given holding answers, but the text of the
        # question is not in the page - you only find it when the
        # question is eventually answered.

        for t in tag_with_most_paragraphs:
            if t.__class__ == NavigableString:
                s = str(t)
                s = re.sub('(?ims)\s+', ' ', s)
                if re.match('(?ims)^\s*$', s):
                    continue
                else:
                    self.add_to_paragraph(tidy_string(str(t)))
                if verbose: print "string: " + str(s)
            elif t.__class__ == Tag:
                # Look for any <a name=""> tags in here:
                a = t.find(lambda p: p.name == 'a' and p.has_key('name'))
                if a:
                    self.sp_name = a['name']
                if t.has_key('align') and t['align'].lower() == 'right':
                    # Right aligned tags just have the question ID.
                    if self.find_id_and_possible_holding_date(t):
                        self.complete_question()
                    else:
                        if verbose:
                            print "Couldn't parse top-level right aligned tag: " + str(
                                t)
                elif t.has_key('class') and t['class'] == 'largeHeading':
                    self.add_large_heading(tidy_string(non_tag_data_in(t)))
                elif self.something_centered(t) or self.c1_heading(t):
                    # Centred tags are headings for questions...
                    s = tidy_string(non_tag_data_in(t))
                    if len(s) > 0:
                        self.complete_answer()
                        if verbose: print "center: " + s
                        self.add_heading(s)
                elif t.name == 'table':
                    # This is probably a table that's inserted just to
                    # right align the question ID.  The left cell may
                    # contain something to indicate that it's a
                    # holding answer.
                    if self.find_id_and_possible_holding_date(t):
                        # Then also look for the "Holding answer
                        # issued" details...
                        s = non_tag_data_in(t)
                        self.find_holding_answer_issued(s)
                        self.complete_question()
                    else:
                        # Then maybe it's a table as part of the
                        # answer, so add it as a paragraph.
                        self.add_paragraph(str(t))
                elif t.name == 'p':
                    if re.search(
                            "(The following questions were given holding answers|Questions given holding answers)",
                            tidy_string(non_tag_data_in(t))):
                        if verbose:
                            print "Found the trailing holding question list!"
                        # This indicates the end of the day's report
                        # for us (just ignore the following list of
                        # answers - it's not very interesting until we
                        # parse some later day and we can tell what
                        # the question was...)
                        break
                    if verbose:
                        print "Didn't find the trailing holding question list in: " + non_tag_data_in(
                            t)
                    non_empty_contents = filter(
                        lambda x: x.__class__ != NavigableString or not re.
                        match('^\s*$', x), t.contents)
                    if len(non_empty_contents) == 0:
                        continue
                    initial_strong_text = ''
                    while len(non_empty_contents) > 0 and non_empty_contents[
                            0].__class__ == Tag and (
                                non_empty_contents[0].name == 'strong'
                                or non_empty_contents[0].name == 'b'):
                        initial_strong_text += " " + non_tag_data_in(
                            non_empty_contents[0])
                        non_empty_contents = non_empty_contents[1:]
                    if len(initial_strong_text) > 0:
                        speaker_name = tidy_string(initial_strong_text)
                        # In some files this will be the ID (possibly
                        # plus holding indication), not right aligned
                        # as usual :(
                        if self.find_id_and_possible_holding_date(
                                speaker_name):
                            self.complete_question()
                        else:
                            speaker_name = re.sub('(?ims)\s*:\s*$', '',
                                                  speaker_name)
                            person_id = self.valid_speaker(speaker_name)
                            if speaker_name and person_id:
                                self.complete_answer()
                                self.set_speaker(speaker_name, person_id)
                                for e in non_empty_contents:
                                    s = tidy_string(str(e))
                                    self.add_to_paragraph(s)
                            else:
                                self.add_paragraph_removing_enclosure(t)
                    else:
                        self.add_paragraph_removing_enclosure(t)
                elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center':
                    # Just add them in a paragraph anyway, even though
                    # that wouldn't be valid HTML 4 strict in the case
                    # of the last three (IIRC)
                    self.add_paragraph(str(t))
                else:
                    # Well, if it's empty of text we don't care...
                    s = non_tag_data_in(t)
                    if not re.match('(?ims)^\s*$', s):
                        raise Exception, "Unknown tag found of name '" + t.name + "' with text: " + t.prettify(
                        )
        self.complete_answer()

        # Now output all the XML, working out IDs for each element.
        # IDs are of the form:
        #
        #   uk.org.publicwhip/spwa/YYYY-MM-DD.X.T
        #
        #     .... where:
        #            - YYYY-MM-DD is an ISO 8601 date
        #
        #            - X is an integer starting at 0 on each day, which
        #              should be incremented for each new heading and
        #              be the same for a group of questions and their
        #              answer.
        #
        #            - T is "mh" or "h" for major and minor headings,
        #             "q0", "q1", "q2", etc. for each group of
        #             questions and "r0", "r1", etc. for the answers

        x = -1
        last_heading = None
        current_sp_id = None

        index = 0

        for i in range(0, len(self.all_stuff)):

            if i > 0:
                previous = self.all_stuff[i - 1]
            else:
                previous = None

            if i < (len(self.all_stuff) - 1):
                next = self.all_stuff[i + 1]
            else:
                next = None

            a = self.all_stuff[i]

            self.ofp.write('\n\n')

            if a.__class__ == Heading:
                last_was_answer = True
                if a.major:
                    subtype = "mh"
                else:
                    subtype = "h"
                if next and next.__class__ == QuestionOrReply and next.sp_id:
                    # Then use the question's sp_id:
                    self.ofp.write(a.to_xml(self.get_id(next.sp_id, subtype)))
                else:
                    x += 1
                    self.ofp.write(a.to_xml(self.get_id(str(x), subtype)))
                last_heading = a
            elif a.__class__ == QuestionOrReply:
                # Occasionally we think questions are actually
                # answers, so check the beginning of the first
                # paragraph:
                if not a.is_question and len(a.paragraphs) > 0 and re.search(
                        '^(?ims)\s*To\s+ask', a.paragraphs[0]):
                    a.is_question = True
                # If we're suddenly in an answer, reset index.
                if (not a.is_question) and previous and not (
                        previous.__class__ == QuestionOrReply
                        and not previous.is_question):
                    index = 0
                # If we're suddenly in a question, reset index and increment x unless the previous is a heading
                elif a.is_question:
                    if previous:
                        if previous.__class__ == QuestionOrReply:
                            if previous.is_question:
                                # If the one before is a question, that's fine.
                                current_sp_id = a.sp_id
                            else:
                                current_sp_id = a.sp_id
                                # If the previous one was an answer
                                # then we need to replay the last
                                # heading:
                                if not last_heading:
                                    raise Exception, "Somehow there's been no heading so far."
                                last_heading.sp_name = a.sp_name
                                if current_sp_id:
                                    self.ofp.write(
                                        last_heading.to_xml(
                                            self.get_id(current_sp_id, "h")))
                                else:
                                    x += 1
                                    self.ofp.write(
                                        last_heading.to_xml(
                                            self.get_id(str(x), "h")))
                                self.ofp.write("\n\n")
                                index = 0
                        else:
                            # i.e. this is the normal case, a question after a heading:
                            current_sp_id = a.sp_id
                            index = 0
                    else:
                        raise Exception, "Nothing before the first question (no heading)"
                if a.is_question:
                    subtype = "q" + str(index)
                else:
                    subtype = "r" + str(index)
                if current_sp_id:
                    self.ofp.write(
                        a.to_xml(self.get_id(current_sp_id, subtype)))
                else:
                    self.ofp.write(a.to_xml(self.get_id(str(x), subtype)))
                index += 1

        self.ofp.write("</publicwhip>")
        self.ofp.close()

        retcode = call(["mv", temp_output_filename, output_filename])
        if retcode != 0:
            raise Exception, "Moving " + temp_output_filename + " to " + output_filename + " failed."

        xmlvalidate.parse(output_filename)
        #retcode = call( [ "xmlstarlet", "val", output_filename ] )
        #if retcode != 0:
        #    raise Exception, "Validating "+output_filename+" for well-formedness failed."

        fil = open('%schangedates.txt' % xml_output_directory, 'a+')
        fil.write('%d,spwa%s.xml\n' % (time.time(), self.date))
        fil.close()
Code Example #9
def parse_html(session, report_date, soup, page_id, original_url):
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception, 'We only expect one <div> child of <div id="ReportView">; there were %d in page with ID %d' % (len(div_children_of_report_view), page_id)

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]

    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page:

    contents_div, text_div = top_level_divs

    # Just check that my assumption that the first div only contains
    # links is correct:

    contents_tuples = []

    contents_links = contents_div.findAll(True)
    for link in contents_links:
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception, "There was something other than a <br> or an <a> in the supposed contents <div>, for page ID: %d" % (page_id,)
        href = link['href']
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception, "Failed to find the ID from '%s' in page with ID: %d" % (href, page_id)
        contents_tuples.append((m.group(1), tidy_string(non_tag_data_in(link))))

    parsed_page = ParsedPage(session, report_date, page_id)

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not len(unicode(top_level).strip()):
            continue
        if top_level.name == 'h2':
            section_title = tidy_string(non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception, "There was an empty section title in page ID: %d" % (page_id)
            parsed_page.sections.append(
                Section(section_title, current_url))
        elif top_level.name in ('br',):
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            try:
                current_url = original_url + "#" + top_level['id']
            except KeyError:
                pass
        elif top_level.name == 'div':
            # This div contains a speech, essentially:

            # The new style pages wrap speeches in <p>/<span> tags that we can
            # ignore, so remove them from the tree.  Occasionally there are
            # multiple spans in a <p>, hence the for loop.
            # This does mean we lose some formatting information, but because
            # it's hard-coded style attributes in the spans, it's arguable that
            # we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and speech_part.name != None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            speaker_name = re.sub(r'^\d+\.\s*', '', speaker_name)
                            removed_number = match.group(0)
                        # If there's a trailing colon, remove that (and any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date,
                                                current_time,
                                                current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception, "Unexpected tag '%s' in page ID: %d" % (speech_part.name, page_id)
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    # print "tidied_paragraph is", tidied_paragraph.encode('utf-8'), "of type", type(tidied_paragraph)
                    division_way, division_candidate, division_candidate_id = is_division_way(tidied_paragraph, report_date)
                    member_vote = is_member_vote(tidied_paragraph, report_date, expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, suspension_time_type, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time_type = suspension_time = None
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None,
                                                    report_date,
                                                    current_time,
                                                    current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (current_votes.candidate != division_candidate):
                            current_votes = Division(report_date, current_url, divnumber, division_candidate, division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception, "Got a member's vote before an indication of which way the vote is"
                        current_votes.add_vote(current_division_way, tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        if current_votes:
                            current_votes = None
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if len(current_speech.paragraphs) == 0:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception, "Totally unparsed element:\n%s\n... unhandled in page ID: %d" % (speech_part, page_id)

        else:
            raise Exception, "There was an unhandled element '%s' in page with ID: %d" % (top_level.name, page_id)

    return parsed_page
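
For orientation, here is a minimal sketch of how the parsed_page built above might be walked once parsing finishes. The walker's name (dump_parsed_page) is hypothetical; Speech and Division are the classes instantiated in the loop above, and only attributes that actually appear there (sections, speeches_and_votes, paragraphs, candidate) are assumed.

def dump_parsed_page(parsed_page):
    # Each section holds an ordered mix of Speech and Division objects,
    # exactly as they were appended in the parsing loop above.
    for section in parsed_page.sections:
        for item in section.speeches_and_votes:
            if isinstance(item, Speech):
                # Speech.paragraphs holds the tidied paragraph strings.
                for paragraph in item.paragraphs:
                    print paragraph
            elif isinstance(item, Division):
                # Division.candidate is the person (or option) being voted for;
                # individual votes were added via add_vote().
                print "Division, candidate:", item.candidate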
Code example #10
def is_division_way(element, report_date=None):
    """If it's a division heading, return a normalized version, otherwise None

    >>> is_division_way('  For ')
    ('FOR', None, None)
    >>> is_division_way('nonsense')
    (None, None, None)
    >>> is_division_way('abstentions ')
    ('ABSTENTIONS', None, None)
    >>> is_division_way(":\xA0FOR")
    ('FOR', None, None)
    >>> is_division_way('Abstention')
    ('ABSTENTIONS', None, None)
    >>> is_division_way('Absentions')
    ('ABSTENTIONS', None, None)
    >>> example_date = datetime.date(1999, 5, 13)
    >>> is_division_way('VOTES FOR DONALD DEWAR', example_date)
    ('FOR', 'Donald Dewar', u'uk.org.publicwhip/member/80147')
    >>> is_division_way('now cast your votes for someone', example_date)
    (None, None, None)
    >>> example_date = datetime.date(2000, 3, 14)
    >>> is_division_way('For Mr Kenneth Macintosh', example_date)
    ('FOR', 'Mr Kenneth Macintosh', u'uk.org.publicwhip/member/80191')
    >>> is_division_way('For option 1', example_date)
    ('FOR', 'Option 1', None)
    >>> is_division_way('The following member took the oath:')
    ('FOR', 'oath', None)
    >>> is_division_way('The following member made a solemn affirmation:')
    ('FOR', 'affirmation', None)
    >>> is_division_way('The following member made a solemn affirmation and repeated it in French:')
    ('FOR', 'affirmation', None)
    """

    tidied = tidy_string(non_tag_data_in(element)).upper()
    # Strip any non-word letters at the start and end:
    tidied = re.sub(r'^\W*(.*?)\W*$', '\\1', tidied)

    if tidied in DIVISION_HEADINGS:
        return (tidied, None, None)
    elif tidied in ('ABSTENTION', 'ABSENTIONS'):
        return ('ABSTENTIONS', None, None)
    elif re.search('^THE FOLLOWING MEMBERS? TOOK THE OATH( AND REPEATED IT IN .*)?:?$', tidied):
        return ('FOR', 'oath', None)
    elif re.search('^THE FOLLOWING MEMBERS? MADE A SOLEMN AFFIRMATION( AND REPEATED IT IN .*)?:?$', tidied):
        return ('FOR', 'affirmation', None)
    elif len(tidied.split()) < 128:
        # The second regular expression could be *very* slow on
        # strings that begin 'FOR', so only try it on short strings
        # that might be introducing a division, and assume that there
        # are 2 to 4 words in the name:
        m1 = re.search(r'^(?i)VOTES? FOR ([A-Z ]+)$', tidied)
        m2 = re.search(r'^FOR ((?:[A-Z]+\s*){2,4})$', tidied)
        m = m1 or m2
        if m:
            person_name = m.group(1).title()
            person_id = None
            if report_date:
                person_id = get_unique_person_id(person_name, report_date)
            return ('FOR', person_name, person_id)
        else:
            m = re.search(r'FOR OPTION (\d+)$', tidied)
            if m:
                return ('FOR', 'Option ' + m.group(1), None)
    return (None, None, None)
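
DIVISION_HEADINGS is defined elsewhere in the module. A plausible definition, judging only from the doctests above (which confirm FOR and ABSTENTIONS), would be something like the following; the exact tuple is an assumption, not shown in this excerpt.

# Assumed definition of the division headings compared against the
# upper-cased, tidied heading text in is_division_way():
DIVISION_HEADINGS = ('FOR', 'AGAINST', 'ABSTENTIONS')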
Code example #11
def is_member_vote(element, vote_date, expecting_a_vote=True):
    """Returns a speaker ID if this looks like a member's vote in a division

    Otherwise returns None.  If it looks like a vote, but the speaker
    can't be identified, this throws an exception.  As an example:

    >>> is_member_vote('Something random...', '2012-11-12')
    >>> is_member_vote('Baillie, Jackie (Dumbarton) (Lab)', '2012-11-12')
    u'uk.org.publicwhip/member/80476'
    >>> is_member_vote('Alexander, Ms Wendy (Paisley North) (Lab)', '2010-05-12')
    u'uk.org.publicwhip/member/80281'
    >>> is_member_vote('Purvis, Jeremy (Tweeddale, Ettrick and Lauderdale)', '2005-05-18')
    u'uk.org.publicwhip/member/80101'

    Now some examples that should be ignored:

    >>> is_member_vote(': SP 440 (EC Ref No 11766/99, COM(99) 473 final)', '1999-11-23')
    >>> is_member_vote('SP 666 (EC Ref No 566 99/0225, COM(99) (CNS))', '2000-02-08')
    >>> is_member_vote('to promote a private bill, the company relied on its general power under section 10(1)(xxxii)', '2006-05-22')

    And one that should throw an exception:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12')
    Traceback (most recent call last):
      ...
    Exception: A voting member 'Jeffrey Lebowski (Los Angeles)' couldn't be resolved

    If expecting_a_vote is False, then don't throw an exception if
    the name can't be resolved:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12', expecting_a_vote=False)

    Also try resolving names that aren't comma-reversed:

    >>> is_member_vote('Brian Adam (North-East Scotland) (SNP)', '1999-11-09')
    u'uk.org.publicwhip/member/80129'

    """
    tidied = tidy_string(non_tag_data_in(element))

    from_first_and_last = lambda m: m and "%s %s (%s)" % (m.group('first_names'),
                                                          m.group('last_name'),
                                                          m.group('constituency'))

    from_full = lambda m: m and m.group('full_name')
    vote_matches = (
        (member_vote_re, from_first_and_last),
        (member_vote_just_constituency_re, from_first_and_last),
        (member_vote_fullname_re, from_full))

    reformed_name = first(processor(regexp.search(tidied))
                        for regexp, processor in vote_matches)

    if not reformed_name:
        return None

    person_id = get_unique_person_id(reformed_name, str(vote_date))

    if person_id is None and expecting_a_vote:
        print "reformed_name is:", reformed_name
        print "vote_date is:", vote_date
        raise Exception, "A voting member '%s' couldn't be resolved" % (reformed_name,)
    else:
        return person_id
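
The first() helper used above isn't shown in this excerpt. Its behaviour, implied by the `if not reformed_name` check, is to return the first truthy value produced by the generator, or None if there is none. A sketch of that assumed shape:

def first(iterable):
    # Return the first truthy element of iterable, or None if there isn't one.
    # (Assumed shape of the helper; the real definition lives elsewhere.)
    for value in iterable:
        if value:
            return value
    return None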
Code example #12
        filename_year = int('20'+two_digit_year,10)
    try:
        date_from_filename = datetime.date(filename_year,filename_month,filename_day)
    except ValueError:
        date_from_filename = None
        if verbose: print "Date in filename %s-%s-%s" % ( filename_year, filename_month, filename_day )

    # Don't soup it if we don't have to:
    if date_from_filename and date_from_filename < all_after_date:
        continue

    day_soup = MinimalSoup(day_html)

    day_body = day_soup.find('body')
    if day_body:
        page_as_text = non_tag_data_in(day_body)
    else:
        error = "File couldn't be parsed by MinimalSoup: "+day_filename
        raise Exception, error

    # Now guess the date from the file contents as well:
    m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text)
    if not m:
        m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text)
    if m:
        day_of_week = m.group(2)
        day = m.group(3)
        month = month_name_to_int(m.group(4))
        if month == 0:
            print "Whole match was '" + str(m.group(0)) + "'"
            raise Exception, "Month name '"+m.group(4)+"' not known in file: "+day_filename
Code example #13
                                           filename_day)
    except ValueError:
        date_from_filename = None
        if verbose:
            print "Date in filename %s-%s-%s" % (filename_year, filename_month,
                                                 filename_day)

    # Don't soup it if we don't have to:
    if date_from_filename and date_from_filename < all_after_date:
        continue

    day_soup = MinimalSoup(day_html)

    day_body = day_soup.find('body')
    if day_body:
        page_as_text = non_tag_data_in(day_body)
    else:
        error = "File couldn't be parsed by MinimalSoup: " + day_filename
        raise Exception, error

    # Now guess the date from the file contents as well:
    m = re.search(
        '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',
        page_as_text)
    if not m:
        m = re.search(
            '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',
            page_as_text)
    if m:
        day_of_week = m.group(2)
        day = m.group(3)