def scrape_hansard(self, session, chamber, url, hansard_id):
    """Scrape one Hansard transcript page and save every speech found on it.

    Every ``<p>`` element of the page is visited in document order and
    handled according to its ``class`` attribute:

    * ``SubjectHeading`` / ``ProceduralHeading`` update the subject and
      section attached to the speeches that follow.
    * ``DateOfTranscript`` / ``TitlePageDate`` set the sitting day.
    * ``SpeakerBegins`` saves the speech in progress and starts a new one.
    * ``SpeakerContinues`` appends a line to the speech in progress.

    Paragraphs without a ``class`` attribute, or with any other class, are
    ignored.

    Args:
        session: passed through to ``Speech``.
        chamber: passed through to ``Speech``.
        url: page fetched with ``self.lxmlize``; also recorded as the
            source of each speech.
        hansard_id: passed through to ``Speech``.

    Raises:
        ValueError: if a date paragraph does not match
            ``"%A, %B %d, %Y"`` (raised by ``strptime``).
    """
    subject = None
    procedure = None
    speech = None
    day = None
    sequence = 1

    page = self.lxmlize(url)
    for para in page.xpath(".//p"):
        try:
            klass = para.attrib['class'].strip()
        except KeyError:
            continue  # Some para entries have no class.

        if klass == 'SubjectHeading':
            subject = re.sub(r"\s+", " ", para.text_content()).strip()

        elif klass == 'ProceduralHeading':
            procedure = re.sub(r"\s+", " ", para.text_content()).strip()

        elif klass == 'SpeakerBegins':
            attribution = [
                x.text_content().strip()
                for x in para.xpath(".//span[@class='Attribution']")
            ]
            # XXX: Check if we have a "J. Q. Public:" at the beginning
            #      to mark as attributed. Early results show fail on
            #      picking that up.
            if not attribution:
                logger.debug("Error: Speaker began without attribution")
                logger.debug(" URL: %s", url)
                logger.debug(" Txt: %s", para.text_content()[:30])
                continue

            if day is None:
                logger.debug("Error: Day is None. Bad juju.")
                logger.debug(url)
                continue

            if speech:
                self.save_object(speech)
                # Drop the reference once saved: if the new speaker turns
                # out to be unusable (empty name below), later
                # continuation paragraphs must not be appended to the
                # previous speaker, and the object must not be saved a
                # second time.
                speech = None

            person = attribution[0].rstrip(":")
            if person == "":
                logger.debug("Error: empty person string. Bad juju.")
                continue

            text = para.text_content()
            speech = Speech(session, chamber, hansard_id, day, sequence,
                            person, text,
                            subject=subject, section=procedure)
            speech.add_source(url)
            sequence += 1

        elif klass == 'SpeakerContinues':
            if speech is None:
                logger.debug("Continue before a begin. bad juju.")
                continue
            text = para.text_content()
            speech['text'] += "\n%s" % (text)

        elif klass == 'DateOfTranscript' or klass == 'TitlePageDate':
            # Drop non-ASCII characters, then decode back to text:
            # strptime() needs a str, not the bytes encode() returns.
            date_text = (para.text_content().strip()
                         .encode("ascii", "ignore").decode("ascii"))
            day = dt.datetime.strptime(date_text, "%A, %B %d, %Y")

    # Save the final speech, which no later SpeakerBegins flushed.
    if speech:
        self.save_object(speech)
def scrape_day(self, session, chamber, day_url):
    """Scrape one day's floor transcript and save the speeches it contains.

    The children of ``<div id="transcript">`` (after the first, which is a
    navigation div) are processed in document order:

    * ``<h2>`` starts a new major section, ``<h3>`` a subsection of it.
    * ``<p class="speakerStart">`` begins a new speech by a named speaker.
    * ``<p class="timeStamp">`` updates the time used for later speeches.
    * ``<p class="procedure">``, or a class-less ``<p>`` while nobody is
      speaking, is saved as a ``type='procedure'`` entry.
    * a class-less ``<p>`` while somebody is speaking is appended to the
      current speech.

    A speaker's speech is saved once, when it is complete: at the next
    speaker, at the next procedural entry, or at the end of the
    transcript.

    Args:
        session: passed through to ``Speech``.
        chamber: passed through to ``Speech``.
        day_url: page fetched with ``self.lxmlize``; must contain
            ``Date=YYYY-MM-DD``, which supplies the date of the sitting.

    Raises:
        IndexError: if ``day_url`` has no ``Date=`` component, the page has
            no transcript div, a ``speakerStart`` paragraph has fewer than
            two children, or a non-empty procedural paragraph has no
            ``<a>`` child.
    """
    doc = self.lxmlize(day_url)
    date = re.findall(r'Date=(\d{4}-\d{1,2}-\d{1,2})', day_url)[0]
    when = datetime.datetime.strptime(date, '%Y-%m-%d')
    record_id = 'floor-' + date
    sequence = 0
    last_h2 = ''
    section = ''
    speech = None  # speaker speech being assembled; not yet saved

    transcript = doc.xpath('//div[@id="transcript"]')[0]
    # skip first item, navigation div
    for item in transcript.getchildren()[1:]:
        if item.tag == 'h2':
            # new major section
            last_h2 = clean_spaces(item.text_content())
            section = last_h2

        elif item.tag == 'h3':
            # new subsection
            section = last_h2 + ': ' + clean_spaces(item.text_content())

        elif item.tag != 'p':
            self.error('unexpected tag <%s>', item.tag)

        else:
            klass = item.get('class')

            if klass == 'speakerStart':
                # new speaker
                # 99% of the time there are two children, <a>, <strong>
                # sometimes there are more (looks like format errors),
                # so we warn for now
                children = item.getchildren()
                a = children[0]
                strong = children[1]
                if len(children) > 2:
                    self.warning('found extra tags in speakerStart: %s',
                                 ', '.join(x.tag for x in children))
                anchor = day_url + '#' + a.get('name')
                speaker = strong.text_content().rstrip(':')
                text = strong.tail
                sequence += 1

                # The previous speaker is done; save that speech before
                # replacing it, so one-paragraph speeches are not lost.
                if speech is not None:
                    self.save_speech(speech)
                speech = Speech(session, chamber, record_id, when,
                                sequence, speaker, text, section=section)
                speech.add_source(anchor)

            elif klass == 'timeStamp':
                # Text is the time as digits, the last two being minutes.
                timestamp = item.text_content().strip()
                when = when.replace(hour=int(timestamp[:-2]),
                                    minute=int(timestamp[-2:]))

            elif klass == 'procedure' or (klass is None and speech is None):
                # procedural action indicated by procedural tag or by
                # an empty tag with nobody speaking in prior session
                #
                # The sequence number is consumed even for paragraphs that
                # end up skipped, which keeps the numbering of the entries
                # that follow unchanged.
                sequence += 1
                content = item.text_content()
                if content.strip() == '':
                    # Checked before the anchor lookup so that an empty
                    # paragraph without an <a> child is simply skipped.
                    continue
                anchor = day_url + '#' + item.xpath('a')[0].get('name')

                # A procedural entry ends whatever speech was in progress.
                if speech is not None:
                    self.save_speech(speech)
                procedure = Speech(session, chamber, record_id, when,
                                   sequence, '-fixme-', content,
                                   section=section, type='procedure')
                procedure.add_source(anchor)
                self.save_speech(procedure)
                speech = None

            elif klass is None:
                # continuation paragraph of the current speaker
                content = item.text_content()
                if not content:
                    continue
                extra = item.getchildren()
                if len(extra) > 1:
                    # Report this paragraph's own children, not those of
                    # the last speakerStart paragraph.
                    self.warning(
                        'found extra tags in continuation paragraph: %s',
                        ', '.join(x.tag for x in extra))
                speech['text'] += '\n\n' + content

            else:
                self.error('unknown p class=%s', klass)

    # Save the last speech of the day, which nothing after it flushed.
    if speech is not None:
        self.save_speech(speech)