def add_house_votes(self, vote, filename):
    """Read a House roll-call PDF and record its tallies and votes on ``vote``.

    The PDF is converted with ``convert_pdf(filename, 'xml')`` and every text
    node under a ``<text>`` element is visited in document order:

    * a node starting with ``AYES`` is the tally line; its four numbers fill
      ``vote['yes_count']``, ``vote['no_count']`` and ``vote['other_count']``
      (not-voting plus paired);
    * ``Y`` / ``N`` / ``x`` select ``vote.yes`` / ``vote.no`` / ``vote.other``
      as the recorder for the legislator that follows and reset the name;
    * ``R`` / ``D`` / ``I`` (presumably the party column -- confirm against a
      sample PDF) end the current name and pass it to the selected recorder;
    * any other node is a fragment of the name, joined with single spaces.

    Args:
        vote: object supporting item assignment and exposing ``yes``, ``no``
            and ``other`` callables that each take a name.
        filename: path of the PDF handed to ``convert_pdf``.

    Raises:
        AttributeError: if a node starts with ``AYES`` but does not match the
            tally pattern.
        TypeError: if an ``R``/``D``/``I`` node is seen before any
            ``Y``/``N``/``x`` marker (no recorder selected yet).
    """
    # Raw string so the \d escapes reach the regex engine untouched.  The last
    # group is preceded by \D* rather than a greedy .*, which would swallow
    # every digit but the final one (e.g. "PAIRED 12" was captured as "2").
    vcount_re = re.compile(
        r'AYES.* (\d+).*NAYS.* (\d+).*NOT VOTING.* (\d+).* PAIRED\D*(\d+)')

    xml = convert_pdf(filename, 'xml')
    doc = lxml.html.fromstring(xml)

    # marker text -> function to call with the next complete legislator name
    recorders = {'Y': vote.yes, 'N': vote.no, 'x': vote.other}

    vfunc = None
    name = ''

    for textitem in doc.xpath('//text/text()'):
        if textitem.startswith('AYES'):
            ayes, nays, nv, paired = vcount_re.match(textitem).groups()
            vote['yes_count'] = int(ayes)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(paired)
        elif textitem in recorders:
            # a vote marker starts a new legislator entry
            vfunc = recorders[textitem]
            name = ''
        elif textitem in ('R', 'D', 'I'):
            # name is complete; it is cleared at the next vote marker
            vfunc(name)
        elif name:
            name += ' ' + textitem
        else:
            name = textitem
def scrape_votes(self, url, motion, date, chamber):
    """Download a roll-call PDF and build a ``Vote`` from its text.

    The PDF text is rewritten so that the section headings become standalone
    comma-separated tokens, then split on commas.  Tokens between the
    ``Yeas`` heading and ``DISCLAIMER`` are cleaned up and classified with
    ``self.check_name(name, nays, other)``: a result of 1 adds the name to the
    yes votes, 2 to the no votes, 3 to the other votes.  Any other result is
    treated as a non-name token and only inspected for the ``Nays`` /
    ``Absent...`` headings, which switch the section flags passed to
    ``check_name``.

    Args:
        url: location of the vote PDF, fetched with ``self.urlretrieve``.
        motion: motion text passed through to ``Vote``.
        date: vote date passed through to ``Vote``.
        chamber: chamber passed through to ``Vote``.

    Returns:
        A ``Vote`` whose counts are the lengths of the collected name lists,
        with ``yes_votes``, ``no_votes`` and ``other_votes`` set.  ``passed``
        is true when the word "passed" occurs anywhere in the text.

    Raises:
        ValueError: if the text has no ``Yeas--`` or ``DISCLAIMER`` marker.
    """
    vote_pdf, _ = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')

    # Order matters: the specific "Xxx--" headings must be rewritten before
    # the generic "--" -> "," replacement, and the "Absent..." heading is
    # rewritten last, after its trailing "--" has already become a comma.
    replacements = (
        ("Yeas--", ",Yeas, "),
        ("Nays--", ",Nays, "),
        ("Total--", ",Total, "),
        ("DISCLAIMER", ",DISCLAIMER,"),
        ("--", ","),
        ("Absent or those not voting", ",Absentorthosenotvoting,"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    passed = "passed" in text
    split_text = text.split(",")
    yea_mark = split_text.index("Yeas") + 1
    end_mark = split_text.index("DISCLAIMER")

    # section flags handed to check_name; both False means the Yeas section
    nays, other = False, False
    yes_votes = []
    no_votes = []
    other_votes = []

    for name in split_text[yea_mark:end_mark]:
        name = name.replace("\n", "")

        # drop a trailing parenthesised word: keep the first word of a
        # two-word token, the first two words of a three-word token
        if "(" in name:
            words = name.split()
            if len(words) == 2:
                name = words[0]
            elif len(words) == 3:
                name = words[0] + " " + words[1]

        # remove a single leading space left over from the ", " separators
        if name.startswith(" "):
            name = name[1:]

        # collapse long multi-word tokens (e.g. the rewritten headings)
        if len(name.split()) > 3:
            name = name.replace(" ", "")

        # classify once instead of re-calling check_name for every branch
        # (assumes check_name has no side effects -- it is not visible here)
        category = self.check_name(name, nays, other)
        if category == 1:
            yes_votes.append(name)
        elif category == 2:
            no_votes.append(name)
        elif category == 3:
            other_votes.append(name)
        else:
            # not a name: see whether it is a heading that changes section
            if name == "Nays":
                nays = True
            if "Absent" in name:
                nays = False
                other = True

    yes_count = len(yes_votes)
    no_count = len(no_votes)
    other_count = len(other_votes)
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote['yes_votes'] = yes_votes
    vote['no_votes'] = no_votes
    vote['other_votes'] = other_votes
    return vote
def add_senate_votes(self, vote, filename):
    """Read a Senate roll-call PDF and record its tallies and votes on ``vote``.

    Each ``<text>`` element of the converted PDF is visited in document
    order.  A heading starting with ``AYES``, ``NAYS`` or ``NOT VOTING``
    selects ``vote.yes``, ``vote.no`` or ``vote.other`` as the active recorder
    and stores the number following the U+2212 minus-sign separator under
    ``yes_count``, ``no_count`` or ``other_count``.  A ``SEQUENCE NO`` line
    clears the recorder.  Every other piece of text is passed to the active
    recorder, if there is one.

    Args:
        vote: object supporting item assignment and exposing ``yes``, ``no``
            and ``other`` callables that each take a name.
        filename: path of the PDF handed to ``convert_pdf``.

    Raises:
        IndexError: if a heading line lacks the `` \u2212 `` separator.
        ValueError: if the text after the separator is not an integer.
    """
    separator = u' \u2212 '

    # (heading prefix, recorder attribute on vote, count key), tested in order
    headings = (
        ('AYES', 'yes', 'yes_count'),
        ('NAYS', 'no', 'no_count'),
        ('NOT VOTING', 'other', 'other_count'),
    )

    # parsed with lxml.html so that elements offer text_content()
    tree = lxml.html.fromstring(convert_pdf(filename, 'xml'))

    # callable that receives each following name; None outside a section
    record = None

    for node in tree.xpath('//text'):
        chunk = node.text_content().strip()

        for prefix, recorder_attr, count_key in headings:
            if chunk.startswith(prefix):
                record = getattr(vote, recorder_attr)
                vote[count_key] = int(chunk.split(separator)[1])
                break
        else:
            # not a section heading
            if chunk.startswith('SEQUENCE NO'):
                record = None
            elif record:
                record(chunk)
def scrape_votes(self, url, motion, date, chamber):
    """Download a roll-call PDF and build a ``Vote`` from its text.

    The PDF text is split into lines (re-joined by ``_combine_lines``) and
    each line is scanned for a section heading such as ``Yeas--`` or
    ``Nays--``, which selects the list that subsequent comma-separated names
    are appended to.  A token containing ``None.`` or a trailing
    ``. Total--`` closes the current section, as does ``DISCLAIMER``.

    Args:
        url: location of the vote PDF, fetched with ``self.urlretrieve``.
        motion: key into ``self._vote_mapping``, which yields the motion text
            and the passed flag given to ``Vote``.
        date: vote date passed through to ``Vote``.
        chamber: chamber passed through to ``Vote``.

    Returns:
        A ``Vote`` whose counts are the lengths of the collected name lists,
        with ``yes_votes``, ``no_votes`` and ``other_votes`` set.

    Raises:
        KeyError: if ``motion`` is not present in ``self._vote_mapping``.
    """
    vote_pdf, _ = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')

    # Deliberately no fallback: an unmapped vote type raises KeyError so that
    # it gets noticed instead of being silently recorded as passed.
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []

    # heading text -> list that names following it belong to (None = stop)
    precursors = (
        ('Yeas--', yes_votes),
        ('Nays--', no_votes),
        ('Absent or those not voting--', other_votes),
        ('Absent and those not voting--', other_votes),
        ('Voting Present--', other_votes),
        ('Present--', other_votes),
        ('DISCLAIMER', None),
    )

    # last name of a section followed by its total, e.g. "Smith. Total--12"
    total_re = re.compile(r'(.+?)\. Total--.*')

    # list currently receiving names; None while outside any section
    cur_array = None

    # split on newlines; _combine_lines then re-joins wrapped lines
    for line in _combine_lines(text.split('\n')):

        # a heading anywhere in the line switches the target list and is
        # removed from the line; if several occur, the last one listed wins
        for pc, arr in precursors:
            if pc in line:
                cur_array = arr
                line = line.replace(pc, '')

        for name in line.split(','):
            name = name.strip()

            # "None." or a Total marks the end of a section
            if 'None.' in name:
                cur_array = None
            match = total_re.match(name)
            if match:
                # Guard against a Total seen outside any section (or right
                # after "None."), which previously raised AttributeError.
                if cur_array is not None:
                    cur_array.append(match.group(1))
                cur_array = None

            # append the name if a section is open and it looks like a name
            if cur_array is not None and name and 'Total--' not in name:
                # strip trailing period
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)

    yes_count = len(yes_votes)
    no_count = len(no_votes)
    other_count = len(other_votes)
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote['yes_votes'] = yes_votes
    vote['no_votes'] = no_votes
    vote['other_votes'] = other_votes
    return vote