-
Notifications
You must be signed in to change notification settings - Fork 0
/
lastwords.py
67 lines (57 loc) · 2.15 KB
/
lastwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from page_object import Page
import pandas as pd
import time
# The idea here will be to scrape the information from
# http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
# which is Texas state records on last words of those executed by
# the state since 1984. Note that there were 2 last month.
#
# Oh Texas.
# I'll be using Selenium webdriver and my Page Object to run through
# and pick up all the necessary text. The first iteration will likely
# not run super fast as each person's last words and conviction information
# are on separate pages. So if there are N people, we have at least 2N
# pages to go through and scrape from. Should be fun.
# For the moment though, I'm mostly just worried about efficiently
# scraping the text from the table. That is proving a challenge
def scrape(table_el, start_row=2, end_row=5, col_count=None):
    '''
    Scrape the text out of a Selenium table element.

    Takes in table_el, a predetermined Selenium table element, and
    returns a list of rows, each row a list of cell-text strings.

    start_row / end_row: 1-based tbody row range (half-open, as in
        range()); defaults preserve the original hard-coded rows 2-4.
    col_count: number of <td> columns to read per row. Defaults to the
        module-level global colCount set in the __main__ block, which
        was the original (fragile) behavior.
    '''
    if col_count is None:
        # Fall back to the global computed in __main__ so existing
        # callers (scrape(table_el)) keep working.
        col_count = colCount
    rows = []
    for i in range(start_row, end_row):
        row = []
        for j in range(1, col_count + 1):
            # One XPath lookup per cell: slow but simple.
            cell_text = table_el.find_element_by_xpath(
                './tbody/tr[' + str(i) + ']/td[' + str(j) + ']').text
            row.append(cell_text)
        rows.append(row)
        # Parenthesized print works in both Python 2 and 3.
        print('row appended')
    return rows
def get_col(table_el, col_count=None):
    '''
    Read the column headers out of a Selenium table element.

    Takes in table_el, a predetermined Selenium table element, and
    returns the list of header texts from the first <tbody> row.
    Assumes the first row holds <th> column headers.

    col_count: number of <th> cells to read. Defaults to the
        module-level global colCount set in the __main__ block, which
        was the original (fragile) behavior.

    Duplicate header names are disambiguated with a numeric suffix
    (Name, Name1, Name2, ...). The original code appended str(1) to
    every duplicate, so a third occurrence still collided.
    '''
    if col_count is None:
        col_count = colCount
    columns = []
    for i in range(1, col_count + 1):
        name = table_el.find_element_by_xpath(
            './tbody/tr[1]/th[' + str(i) + ']').text
        if name in columns:
            # Find the first unused numeric suffix for this name.
            suffix = 1
            while name + str(suffix) in columns:
                suffix += 1
            name = name + str(suffix)
        columns.append(name)
    return columns
if __name__ == '__main__':
    # Load the executed-offenders index page and locate the data table
    # (identified by its 'os' CSS class).
    page = Page('http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html')
    table_el = page.find_element_by_locator('class name', 'os')
    # Table dimensions. NOTE: scrape() and get_col() read colCount as a
    # module-level global, so these names must stay as-is.
    rowCount = len(table_el.find_elements_by_xpath('./tbody/tr'))
    colCount = len(table_el.find_elements_by_xpath('./tbody/tr[2]/td'))
    # Headers first, then the cell text, then bundle into a DataFrame.
    headers = get_col(table_el)
    scraped_rows = scrape(table_el)
    frame = pd.DataFrame(scraped_rows, columns=headers)