        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('p.cfm\?i', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://harkin.senate.gov' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(152, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        date = soup2.findAll('h2')
        date = date[1]
        date = utilities.clean_html(str(date))
        date = re.sub('\W', '', date)
        #stores=''
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        stores = soup2.findAll('p')
        stores = utilities.clean_html(str(stores))
        stores = re.sub('\W', ' ', stores)
        names = str(num) + 'Harkin' + date + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

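## The strip-anchors / clean / collapse-whitespace / write-file steps above are
## repeated in nearly every scraper in this file. A minimal sketch of a shared
## helper, assuming the same BeautifulSoup 3 and utilities.clean_html interfaces
## used throughout (save_release is a hypothetical name, not in the original):

def save_release(soup, names):
    ## drop navigation links, then keep whatever text is left
    for tag in soup.findAll('a'):
        tag.extract()
    stores = utilities.clean_html(str(soup))
    stores = re.sub('\W', ' ', stores)
    files = open(names, 'w')
    files.write(stores)
    files.close()
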
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                a += 1
                if a % 2 == 1:
                    fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://reed.senate.gov/newsroom/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)
        mint = soup2.findAll('strong')
        mint = utilities.clean_html(str(mint[0]))
        mint = mint.split(' ')
        mons = mon_key[mint[0]]
        day = re.sub('\W', '', mint[1])
        year = mint[-1]
        names = day + mons + year + 'Reed' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    ## Bingaman: collect release links from the index, then save each release
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://bingaman.senate.gov/news/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('p')
        date = utilities.clean_html(str(divs[2]).split('\n')[0])
        date = re.sub('\W', '', date)
        stores = ''
        for b in range(len(divs)):
            de = utilities.clean_html(str(divs[b]))
            stores += de
        names = 'Bingaman' + str(num) + date + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://billnelson.senate.gov/news/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        h2s = soup2.findAll('h2')
        date = utilities.clean_html(str(h2s[0].findNext('p')))
        for k in range(len(months)):
            att = re.findall(months[k], str(date))
            if len(att) > 0:
                mons = mon_key[months[k]]
        temp = date.split(' ')
        day = re.sub('\W', '', temp[1])
        year = temp[-1]
        agg = day + mons + year
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        stores = utilities.clean_html(str(soup2))
        names = agg + 'BillNelson' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)

            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://brownback.senate.gov/pressapp/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('p')
        date = utilities.clean_html(str(divs[0]))
        date = re.sub(' ', '', date)
        date = date.split(',')
        mon_day = date[1]
        for k in range(len(months)):
            abc = re.findall(months[k], mon_day)
            if len(abc) > 0:
                mons = mon_key[months[k]]
                day = re.sub(months[k], '', mon_day)
        year = re.sub('\W', '', date[-1])
        stores = ''
        for b in range(len(divs)):
            de = utilities.clean_html(str(divs[b]))
            stores += de + ' '

    ## Harkin: index page, release links and dates
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('p.cfm\?i', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    spans = soup.findAll('span')
    for k in range(len(spans)):
        if spans[k].has_key('class'):
            if spans[k]['class'] == 'smaller':
                abc = utilities.clean_html(str(spans[k]))
                abc = abc.split('/')
                mons = month[abc[0]]
                day = abc[1]
                year = abc[-1]
                date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://harkin.senate.gov' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)

    store = ''
    for num in range(len(fr)):
        store += 'http://feinstein.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(2, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        ted = soup2.findAll('td')
        for k in range(len(ted)):
            if ted[k].has_key('width'):
                if ted[k]['width'] == '60%':
                    tt = ted[k]
        almost = utilities.clean_html(str(tt)).split(':')
        eta = almost[-1]
        eta = re.sub('\W', '', eta)
        date = eta
        stores = ''
        opts = soup2.findAll('option')
        for k in range(len(opts)):
            opts[k].extract()
        ast = soup2.findAll('a')
        for k in range(len(ast)):
            ast[k].extract()
        h3s = soup2.findAll('h3')
        for k in range(len(h3s)):
            h3s[k].extract()
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)

    store = ''
    for num in range(len(fr)):
        store += 'http://coleman.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(20, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('td')
        stores = ''
        date = ''
        for k in range(len(divs)):
            if divs[k].has_key('class') and divs[k].has_key('style'):
                if divs[k]['class'] == 'Text':
                    stores += utilities.clean_html(str(divs[k]))
                    aqw = divs[k].findChildren('strong')
                    for m in range(len(aqw)):
                        ester = re.findall('\d\d\d\d', str(aqw[m]))
                        if len(ester) > 0:
                            date = utilities.clean_html(str(aqw[m]))
                            date = re.sub('\W', '', date)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        tds = soup2.findAll('td')
        for k in range(len(tds)):
            if tds[k].has_key('class') and tds[k].has_key('width'):
                if tds[k]['class'] == 'Text' and tds[k]['width'] == '99%':
                    tds[k].extract()
        stores = utilities.clean_html(str(soup2))

            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://domenici.senate.gov/news/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        a = 0
        for k in range(len(ps)):
            if ps[k].has_key('class'):
                a += 1
                if a == 2:
                    date = utilities.clean_html(str(ps[k]))
                    date = utilities.clean_html(str(date))
                    date = re.sub('\W', '', date)
        stores = ''
        for m in range(len(ps)):
            stores += utilities.clean_html(str(ps[m]))
        stores = re.sub('\W', ' ', stores)
        names = str(num) + 'Domenici' + date + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    store = ''
    for num in range(len(fr)):
        store += 'http://ensign.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('div')
        date = soup2.findAll('span')
        for k in range(len(date)):
            if date[k].has_key('class') and date[k].has_key('alt') and date[k].has_key('title'):
                if date[k]['class'] == 'pressappReleaseBody' and date[k]['alt'] == 'Release Date':
                    mint = date[k]
                    mint = utilities.clean_html(str(mint))
                    mint = mint.split(' ')
                    mons = mon_key[mint[1]]
                    days = re.sub('\W', '', mint[2])
                    year = mint[-1]
        stores = ''
        for m in range(len(ps)):
            if ps[m].has_key('class') and ps[m].has_key('alt') and ps[m].has_key('title'):
                if ps[m]['class'] == 'pressappReleaseBody' and ps[m]['title'] == 'Release Body':
                    stores += utilities.clean_html(str(ps[m]))
        stores = re.sub('\W', ' ', stores)
        names = days + mons + year + 'Ensign' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('/~feingold/releases', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://feingold.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(535, len(fr)):
        test = urlopen(fr[num]).read()
        fret = re.findall('\<dpc\s.+', str(test))
        fret2 = fret[0].split('=')
        fret3 = fret2[-1]
        date = re.sub('\W', '', fret3)
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        stores = ''
        for m in range(0, len(ps)):
            stores += utilities.clean_html(str(ps[m])) + ' '
        stores = re.sub('\W', ' ', stores)
        names = str(num) + 'Feingold' + date + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://gregg.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(1, len(fr) - 1):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        h1 = soup2.findAll('h1')
        date = soup2.findAll('h4')
        date = utilities.clean_html(str(date[0]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        stores = ''
        for k in range(len(h1)):
            if h1[k].has_key('class') == False:
                stores += utilities.clean_html(str(h1[k])) + ' '
        for m in range(len(ps)):
            stores += utilities.clean_html(str(ps[m])) + ' '
        stores = re.sub('\W', ' ', stores)
        names = day + mons + year + 'Gregg' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://coburn.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    ##the problem is you need to grab text that is just
    ##sitting in the middle of the page
    for num in range(1, len(fr) - 1):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        date = soup2.findAll('h4')
        date = utilities.clean_html(str(date[0]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        opt = soup2.findAll('option')
        for k in range(len(opt)):
            opt[k].extract()
        h1s = soup2.findAll('h1')
        for k in range(len(h1s)):
            h1s[k].extract()
        h3s = soup2.findAll('h3')
        for k in range(len(h3s)):

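## As the comment above notes, pages like Coburn's keep the release text loose in
## the page body rather than inside one dedicated tag, so the approach is
## subtractive: extract every tag you do not want, then clean whatever remains.
## A compact sketch of the same idea (unwanted_tags is a hypothetical name; the
## tag list mirrors the extractions in the snippet above):

unwanted_tags = ['a', 'option', 'h1', 'h3']
for name in unwanted_tags:
    for tag in soup2.findAll(name):
        tag.extract()
stores = re.sub('\W', ' ', utilities.clean_html(str(soup2)))
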
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://biden.senate.gov/newsroom/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(128, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('p')
        date = utilities.clean_html(str(divs[0]).split('\n')[0])
        date = re.sub('\W', '', date)
        stores = ''
        for b in range(len(divs)):
            de = utilities.clean_html(str(divs[b]))
            stores += de
        names = 'Biden' + str(num) + date + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://dole.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(2, 3):  #len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('td')
        date = soup2.findAll('strong')
        for k in range(len(date)):
            if date[k].has_key('class'):
                if date[k]['class'] == 'recorddate':
                    out = utilities.clean_html(str(date[k]))
        stores = ''
        for m in range(len(ps)):
            if ps[m].has_key('class') and ps[m].has_key('colspan'):
                if ps[m]['class'] == 'text':
                    stores += utilities.clean_html(str(ps[m]))
        stores = re.sub('\W', ' ', stores)
        out = re.sub('\W', '', out)
        names = 'Dole' + str(num) + out + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://burr.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(2, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        tempd = soup2.findAll('strong')
        for k in range(len(tempd)):
            if tempd[k].has_key('class'):
                if tempd[k]['class'] == 'recorddate':
                    date = utilities.clean_html(str(tempd[k]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        tds = soup2.findAll('td')
        its = []
        for m in range(len(tds)):
            if tds[m].has_key('class'):
                its.append(m)
        for k in its:
            if tds[(k - 1)].has_key('align') and tds[k]['class'] == 'vblack11':
                if tds[(k - 1)]['align'] == 'center' and tds[(k - 1)]['class'] == 'Text':
                    stores = utilities.clean_html(str(tds[k]))
        names = day + mons + year + 'Burr' + str(num) + '.txt'
        files = open(names, 'w')

## Warner
for j in range(0, len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    date = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('PressReleases', str(ab))
            if len(ba) > 0:
                fr.append(res[k]['href'])
    awe = soup.findAll('h3')
    for k in range(len(awe)):
        ester = awe[k]
        ester = utilities.clean_html(str(ester))
        ester = re.sub('\W', '', ester)
        date.append(ester)
    store = ''
    for num in range(len(fr)):
        store += 'http://warner.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()

    ## Durbin: index page, release links and dates
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('\?releaseId', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    res = soup.findAll('span')
    for k in range(len(res)):
        if res[k].has_key('style'):
            if res[k]['style'] == 'font-size:10px':
                abc = utilities.clean_html(str(res[k]))
                abc = abc.split('/')
                mons = month[abc[0]]
                day = abc[1]
                year = '20' + abc[2]
                date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://durbin.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')

    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            if res[k]['href']:
                ab = res[k]['href']
                espn = re.findall('press/release', ab)
                if len(espn) > 0:
                    fr.append(ab.encode('UTF-8'))
    date = []
    tds = soup.findAll('td')
    for k in range(len(tds)):
        if tds[k].has_key('class'):
            if tds[k]['class'] == 'date':
                temps = utilities.clean_html(str(tds[k]))
                temps = temps.split('.')
                mons = temps[0]
                mons = re.sub('\W', '', mons)
                mons = month[mons]
                day = temps[1]
                year = temps[-1]
                year = re.sub('\W', '', year)
                year = '20' + year
                date.append(day + mons + year)
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        h2s = soup2.findAll('h2')
        ps = soup2.findAll('p')

    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll("a")
    fr = []
    for k in range(len(res)):
        if res[k].has_key("href"):
            ab = res[k]["href"]
            ba = re.findall("/releases/", str(ab))
            if len(ba) > 0:
                fr.append(ab.encode("UTF-8"))
    temp = soup.findAll("dd")
    date = ""
    for k in range(len(temp)):
        ab = re.findall("/releases/", str(temp[k]))
        if len(ab) > 0:
            fudge = utilities.clean_html(str(temp[k]))
            fudge = fudge.split("-")
            fudge2 = re.sub("\W", "", str(fudge[-1]))
            date += fudge2 + "\n"
    store = ""
    for num in range(len(fr)):
        store += fr[num] + "\n"
    fr = store.split("\n")
    date = date.split("\n")
    date.remove("")
    fr.remove("")
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)

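## This scraper builds fr and date as parallel lists, so entry num of one must
## line up with entry num of the other. A sketch of a pairing that makes the
## alignment explicit, assuming both lists came from the same listing page
## (izip is from Python 2's itertools):

from itertools import izip
for num, (link, when) in enumerate(izip(fr, date)):
    test = urlopen(link).read()
    soup2 = BeautifulSoup(test)
    ## ... clean soup2 and write a file named with `when` and `num`, as above
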
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    a = 0
    ps = soup.findAll('strong')
    for m in range(len(ps)):
        a += 1
        if a < len(fr) + 1:
            date.append(utilities.clean_html(str(ps[m])))
    store = ''
    for num in range(len(fr)):
        store += fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        stores = utilities.clean_html(str(soup2))

ab = res[k]["href"] ab = ab.strip("..") ba = re.findall("&ID", str(ab)) if len(ba) > 0: fr.append(ab.encode("UTF-8")) store = "" for num in range(len(fr)): store += "http://vitter.senate.gov/" + fr[num] + "\n" fr = store.split("\n") fr.remove("") for num in range(182, len(fr)): test = urlopen(fr[num]).read() soup2 = BeautifulSoup(test) stow = soup2.findAll("span") for m in range(len(stow)): if stow[m].has_key("class"): if stow[m]["class"] == "PressReleaseItemDate": mint = utilities.clean_html(str(stow[m])) stores = "" p = soup2.findAll("p") for k in range(1, len(p) - 1): stores += utilities.clean_html(str(p[k])) stores = re.sub("\W", " ", stores) mint = re.sub("\W", "", mint) names = str(num) + "Vitter" + mint + ".txt" files = open(names, "w") files.write(stores) files.close()
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://clinton.senate.gov/news/statements/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        date = utilities.clean_html(str(ps[0]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[2]
        text = ''
        for k in range(len(ps)):
            text += utilities.clean_html(str(ps[k])) + ' '
        stores = re.sub('\W', ' ', text)
        names = day + mons + year + 'Clinton' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://boxer.senate.gov/news/releases/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('div')
        temp = soup2.findAll('span')
        for m in range(len(temp)):
            if temp[m].has_key('class'):
                if temp[m]['class'] == 'pressappReleaseBody':
                    date = utilities.clean_html(str(temp[m]))
                    date = date.split(',')
                    year = re.sub('\W', '', date[-1])
                    mons_day = date[1].split(' ')
                    mons = mon_key[mons_day[1]]
                    day = mons_day[-1]
    for k in range(len(divs)):
        if divs[k].has_key('class'):
            if divs[k]['class'] == 'pressappReleaseBody':
                stores = utilities.clean_html(str(divs[k]))
        names = day + mons + year + 'Boxer' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

## Barrasso
for j in range(0, len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    date = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('_id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    det = soup.findAll('h3')
    for k in range(len(det)):
        dtt = utilities.clean_html(str(det[k]))
        dtt = dtt.split('/')
        mons = month[dtt[0]]
        day = dtt[1]
        year = '20' + dtt[2]
        almost = day + mons + year
        date.append(almost)
    store = ''
    for num in range(len(fr)):
        store += 'http://barrasso.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()

            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://bennett.senate.gov/press/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('p')
        start = soup2.findAll('title')
        date = utilities.clean_html(str(start[0])).split(':')[-1].strip(' ').split('/')
        mons = month[date[0]]
        day = date[1]
        year = date[2]
        agg = day + mons + year
        stores = ''
        for b in range(len(divs)):
            de = utilities.clean_html(str(divs[b]))
            stores += de
        names = agg + 'Bennett' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    store = ''
    for num in range(len(fr)):
        if fr[num][0:2] == '/p':
            store += 'http://cochran.senate.gov' + fr[num] + '\n'
        elif fr[num][0:2] == 'pr':
            store += 'http://cochran.senate.gov/' + fr[num] + '\n'
        else:
            store += fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        stores = ''
        mint = date[num]
        mint = re.sub('\W', '', mint)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)
        names = 'Cochran' + str(num) + mint + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    store = ''
    for num in range(len(fr)):
        store += 'http://hatch.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, 1):  #len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('td')
        for k in range(len(ps)):
            if ps[k].has_key('class') and ps[k].has_key('nowrap'):
                if ps[k]['class'] == 'vblack10':
                    emmit = re.findall('\s\d\d\d\d', str(ps[k]))
                    if len(emmit) == 1:
                        out = utilities.clean_html(str(ps[k]))
                    pass
        date = re.sub('\W', '', out)
        stores = ''
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        opt = soup2.findAll('option')
        for k in range(len(opt)):
            opt[k].extract()
        strongs = soup2.findAll('strong')
        for k in range(len(strongs)):
            strongs[k].extract()
        tds = soup2.findAll('td')
        for j in range(len(tds)):
            if tds[j].has_key('width'):

    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    ps = soup.findAll('p')
    date = []
    for m in range(len(ps)):
        if ps[m].has_key('class'):
            if ps[m]['class'] == 'newsDate':
                ab = utilities.clean_html(str(ps[m]))
                if j == 0:
                    ab += ' ' + '2007'
                if j == 1:
                    ab += ' ' + '2006'
                if j == 2:
                    ab += ' ' + '2005'
                if j == 3:
                    ab += ' ' + '2004'
                if j == 4:
                    ab += ' ' + '2003'
                if j == 5:
                    ab += ' ' + '2002'
                if j == 6:
                    ab += ' ' + '2001'
                test = ps[m].fetchNextSiblings('blockquote')

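## The chain of j tests above just maps the archive-page index to a year. An
## equivalent, more compact lookup, assuming html[0] is the 2007 page down
## through html[6] for 2001, exactly as the tests imply:

years = ['2007', '2006', '2005', '2004', '2003', '2002', '2001']
ab += ' ' + years[j]
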
        if fr[num][0] == 'h':
            store += fr[num] + '\n'
        else:
            store += 'http://thune.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(1, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        stow = soup2.findAll('strong')
        for m in range(len(stow)):
            if stow[m].has_key('class'):
                if stow[m]['class'] == 'recorddate':
                    mint = utilities.clean_html(str(stow[m]))
                    mint = mint.split(' ')
                    mons = mon_key[mint[0]]
                    day = re.sub('\W', '', mint[1])
                    day = re.sub('[a-z]+', '', day)
                    years = mint[-1]
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        tables = soup2.findAll('table')
        for k in range(len(tables)):
            if tables[k].has_key('width') and tables[k].has_key('border') and tables[k].has_key('bordercolor') and tables[k].has_key('cellspacing') and tables[k].has_key('cellpadding'):
                if tables[k]['width'] == '100%' and tables[k]['border'] == '0' and tables[k]['bordercolor'] == 'orange' and tables[k]['cellspacing'] == '0' and tables[k]['cellpadding'] == '0':
                    tables[k].extract()
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)

        store += 'http://wyden.senate.gov' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        act = soup2.findAll('h3')
        for k in range(len(act)):
            act[k].extract()
        span = soup2.findAll('span')
        for j in range(len(span)):
            if span[j].has_key('class') and span[j].has_key('alt') and span[j].has_key('title'):
                if span[j]['class'] == 'pressappReleaseBody' and span[j]['alt'] == 'Release Date' and span[j]['title'] == 'Release Date':
                    date = utilities.clean_html(str(span[j]))
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)
        mint = date
        date = date.split(' ')
        mons = date[1]
        day = re.sub('\W', '', date[2])
        year = date[-1]
        names = day + mons + year + 'Wyden' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(2, len(fr)):
        store += 'http://baucus.senate.gov/newsroom/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        h2s = soup2.find('h2')
        date = h2s.findNext('p')
        date = utilities.clean_html(str(date))
        date = date.split(' ')
        mons = mon_key[date[0]]
        days = re.sub('\W', '', date[1])
        year = date[2]
        stores = utilities.clean_html(str(soup2))
        names = days + mons + year + 'Baucus' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

    res = soup.findAll('a')
    fr = []
    mons = []
    days = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
                mons.append(mon_key[j])
    h3s = soup.findAll('h3')
    for m in range(len(h3s)):
        if h3s[m].has_key('class'):
            if h3s[m]['class'] == 'ContentGrid':
                abc = utilities.clean_html(str(h3s[m]))
                abc = re.sub('[a-z][a-z]', '', str(abc))
                abc = re.sub('\W', '', abc)
                days.append(abc)
    store = ''
    for num in range(len(fr)):
        store += 'http://wicker.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()

            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://conrad.senate.gov/pressroom/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('p')
        date = ps[2]
        date = utilities.clean_html(str(date))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        stores = ''
        h2s = soup2.findAll('h2')
        h3s = soup2.findAll('h3')
        stores += utilities.clean_html(str(h2s[1])).strip(' ').strip('\n') + ' '
        stores += utilities.clean_html(str(h3s[0])).strip(' ').strip('\n').strip('\r') + ' '
        for m in range(len(ps)):
            if ps[m].has_key('style') == False:
                stores += utilities.clean_html(str(ps[m])) + ' '
        stores = re.sub('\W', ' ', stores)
        names = day + mons + year + 'Conrad' + str(num) + '.txt'
        files = open(names, 'w')

            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://www.senate.gov/~chambliss/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(3, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        date = soup2.findAll('h4')
        date = utilities.clean_html(str(date[0]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        divs = soup2.findAll('div')
        text = ''
        if j >= 2:
            for m in range(0, len(divs)):
                text += utilities.clean_html(str(divs[m]))
        if j < 2:
            for m in range(3, len(divs)):
                text += utilities.clean_html(str(divs[m]))
        stores = re.sub('\W', ' ', text)
        names = day + mons + year + 'Chambliss' + str(num) + '.txt'
        files = open(names, 'w')

            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    store = ''
    for num in range(len(fr)):
        store += 'http://bond.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    fr = fr[3:]
    for num in range(len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        divs = soup2.findAll('div')
        date = soup2.findAll('h4')
        date = utilities.clean_html(str(date[0]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.sub('\W', '', date[1])
        year = date[-1]
        for k in range(len(divs)):
            if divs[k].has_key('id'):
                if divs[k]['id'] == 'contentRecord':
                    store = utilities.clean_html(str(divs[k]))
        names = day + mons + year + 'Bond' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(store)
        files.close()

        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    ps = soup.findAll('span')
    a = 0
    date = []
    for m in range(len(ps)):
        if ps[m].has_key('class'):
            if ps[m]['class'] == 'pressappSmallText':
                a += 1
                if a > 2:
                    abc = utilities.clean_html(str(ps[m]))
                    abc = abc.split('/')
                    mons = month[abc[0]]
                    day = abc[1]
                    year = '20' + abc[-1]
                    date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://lugar.senate.gov/press/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)

    store = ''
    for num in range(len(fr)):
        store += 'http://kohl.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        hope = re.findall('\<dpc\sdate\=[A-Z][a-z]+\s\d+\,\s\d\d\d\d', test)
        if len(hope) == 0:
            hope = re.findall('\<dpc\sdate\=\s[A-Z]+\s\d\d\,\s\d\d\d\d', test)
        if len(hope) == 0:
            hope = re.findall('\<dpc\sdate\=\d\d\-\d\d\-\d\d\d\d', test)
        if len(hope) > 0:
            mint = hope[0].split('=')[-1]
            mint = mint.split(' ')
            mons = mon_key[mint[0]]
            day = re.sub('\W', '', mint[1])
            year = mint[-1]
        stores = utilities.clean_html(str(soup2))
        stores = re.sub('\W', ' ', stores)
        names = day + mons + year + 'Kohl' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)
        files.close()

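## The three fallback searches above can be collapsed into one alternation. A
## sketch (the (?:...) group keeps findall returning the full match, so the
## split('=') step still applies unchanged; behavior can differ slightly if a
## page contains more than one dpc tag, since the original tries the patterns
## in priority order):

dpc_date = ('\<dpc\sdate\=(?:[A-Z][a-z]+\s\d+\,\s\d\d\d\d'
            '|\s[A-Z]+\s\d\d\,\s\d\d\d\d'
            '|\d\d\-\d\d\-\d\d\d\d)')
hope = re.findall(dpc_date, test)
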
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    ps = soup.findAll('td')
    a = 0
    for m in range(len(ps)):
        if ps[m].has_key('bgcolor'):
            if ps[m]['bgcolor'] == 'ffffff' or ps[m]['bgcolor'] == 'efefef':
                a += 1
                if a % 2 == 1:
                    date.append(utilities.clean_html(str(ps[m])))
    store = ''
    for num in range(len(fr)):
        store += 'http://tester.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd = soup2.findAll('a')
        for k in range(len(abd)):
            abd[k].extract()
        span = soup2.findAll('span')
        for k in range(len(span)):

    ## Carper
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    spans = soup.findAll('td')
    date = []
    for k in range(len(spans)):
        if spans[k].has_key('class'):
            if spans[k]['class'] == 'relcelldate':
                damp = utilities.clean_html(str(spans[k]))
                damp = damp.split('/')
                mons = month[damp[0]]
                day = damp[1]
                year = damp[-1]
                date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://carper.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(1, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)

## Graham (lgraham.senate.gov)
for j in range(0, len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    hs = soup.findAll('h3')
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    for m in range(len(hs)):
        if hs[m].has_key('class') and hs[m].has_key('style'):
            if hs[m]['class'] == 'ContentGrid':
                date = utilities.clean_html(str(hs))
    store = ''
    for num in range(len(fr)):
        store += 'http://lgraham.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    date = date.strip('[').strip(']')
    date = date.split(',')
    fr.remove('')
    ##so we can process the pages as we move along
    for num in range(1, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ps = soup2.findAll('a')
        for m in range(len(ps)):

    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    ##dates = soup.findAll('td')
    dates = soup.findAll('strong')
    ##for m in range(1,len(dates)-1):
    ##    if dates[m].has_key('height'):
    ##        if dates[m]['height']=='14':
    ##            date.append(utilities.clean_html(str(dates[m])))
    for m in range(1, len(dates) - 1):
        abc = utilities.clean_html(str(dates[m]))
        abc = abc.split('/')
        mons = month[abc[0]]
        day = abc[1]
        year = '20' + abc[2]
        date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://kerry.senate.gov/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):

## Kennedy
for j in range(0, len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('\?id', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    dates = soup.findAll('dt')
    for m in range(len(dates)):
        abc = utilities.clean_html(str(dates[m]))
        abc = abc.split('/')
        mons = month[abc[0]]
        day = abc[1]
        year = '20' + abc[2]
        date.append(day + mons + year)
    store = ''
    for num in range(len(fr)):
        store += 'http://kennedy.senate.gov/newsroom/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):

    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ab = ab.strip('..')
            ba = re.findall('release', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    ps = soup.findAll('p')
    for k in range(len(ps)):
        if ps[k].has_key('class'):
            if ps[k]['class'] == 'press_date':
                temp = utilities.clean_html(str(ps[k])).split('.')
                mons = month[temp[0]]
                day = temp[1]
                year = '20' + temp[2]
                agg = day + mons + year
                date.append(agg)
    for num in range(1, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        h2s = soup2.findAll('h2')
        ps = soup2.findAll('p')
        h2s = utilities.clean_html(str(h2s[0]))
        store = ''
        store += h2s + ' '
        for k in range(len(ps)):

        store += 'http://grassley.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0, len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        ted = soup2.findAll('td')
        a = 0
        stores = ''
        for k in range(len(ted)):
            if ted[k].has_key('class'):
                if ted[k]['class'] == 'text':
                    att = re.findall('\d\d\d\d', str(ted[k]))
                    if len(att) > 0:
                        date = utilities.clean_html(str(ted[k]))
        for k in range(len(ted)):
            if ted[k].has_key('class') and ted[k].has_key('style'):
                if ted[k]['class'] == 'Text':
                    stores += utilities.clean_html(str(ted[k]))
            if ted[k].has_key('class'):
                if ted[k]['class'] == 'recordtitle':
                    stores += utilities.clean_html(str(ted[k]))
        date = date.split(' ')
        mons = mon_key[date[0]]
        day = re.findall('[0-9]+', date[1])[0]
        year = date[-1]
        stores = re.sub('\W', ' ', stores)
        names = day + mons + year + 'Grassley' + str(num) + '.txt'
        files = open(names, 'w')
        files.write(stores)

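## Every scraper in this file follows the same pipeline: collect release links
## from an index page, fetch each release, pull a date out of some page element,
## clean the HTML, and write <day><month><year><Senator><num>.txt. A minimal
## sketch of that shared skeleton, assuming the same BeautifulSoup 3 and
## utilities.clean_html interfaces; scrape and parse_date are hypothetical
## names, with parse_date standing in for the senator-specific date logic:

def scrape(index_url, base_url, link_pat, parse_date, senator):
    soup = BeautifulSoup(urlopen(index_url).read())
    links = []
    for a in soup.findAll('a'):
        if a.has_key('href') and re.findall(link_pat, a['href']):
            links.append(base_url + a['href'].encode('UTF-8'))
    for num in range(len(links)):
        soup2 = BeautifulSoup(urlopen(links[num]).read())
        date = parse_date(soup2)  ## e.g. read an h4, or a 'recorddate' strong
        for a in soup2.findAll('a'):
            a.extract()
        stores = re.sub('\W', ' ', utilities.clean_html(str(soup2)))
        files = open(date + senator + str(num) + '.txt', 'w')
        files.write(stores)
        files.close()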