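"""Scrape entity names from the Massachusetts corporations search
(CorpSearch at corp.sec.state.ma.us) into PostgreSQL.

For each three-letter search string, submit the search-by-entity-name form,
walk every page of results (including the '...' pager continuation links),
and insert one row per entity into the corp_name table. Written for
Python 2 and the pre-4.x Selenium find_element_by_* API.
"""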
from selenium import webdriver
from selenium.webdriver.support import ui
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import psycopg2
import re
import string
from datetime import datetime
url = "http://corp.sec.state.ma.us/CorpWeb/CorpSearch/CorpSearch.aspx"
driver = webdriver.Firefox()
wait = ui.WebDriverWait(driver, 120)
db = psycopg2.connect("dbname=corp_search user=corp_search password=corp_search")
cur = db.cursor()
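# The INSERT in processResultsPage assumes a pre-existing table roughly like
# the following (hypothetical DDL, not part of this script):
#   CREATE TABLE corp_name (
#       ID text PRIMARY KEY,
#       name text,
#       address text,
#       profile_url text
#   );
# A uniqueness constraint on ID is what makes the IntegrityError handling
# below meaningful: duplicate hits across overlapping searches are logged
# and skipped rather than inserted twice.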
log = open('error.log', 'a')

def getSearchResults(searchString):
    driver.get(url)
    first_set = True
    finished = False
    # search by entity name
    driver.find_element_by_id("MainContent_rdoByEntityName").click()
    search_field = driver.find_element_by_id("MainContent_txtEntityName")
    search_field.send_keys(searchString)
    driver.find_element_by_id("MainContent_btnSearch").click()
    try:
        wait.until(EC.presence_of_element_located((By.ID, "MainContent_UpdatePanelGrid")))
    except TimeoutException:
        print "problem loading results for " + searchString
        return
    next_page = 2
    # loop through pagesets (separated by ... links)
    while not finished:
        # first page of set
        table = driver.find_element_by_id("MainContent_UpdatePanelGrid")
        soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
        f = open('test.txt', 'w')
        f.write(str(soup))
        f.close()
        processResultsPage(soup)
        # loop through pages within set
        while soup.find('a', text=str(next_page)):
            next_link = driver.find_element_by_link_text(str(next_page))
            next_link.click()
            wait.until(EC.staleness_of(next_link))
            table = driver.find_element_by_id("MainContent_UpdatePanelGrid")
            next_page = next_page + 1
            soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
            processResultsPage(soup)
        next_links = soup.findAll('a', text="...")
        # after first 20, only one '...' link
        if first_set and len(next_links) == 1:
            next_link = driver.find_element_by_link_text("...")
            first_set = False
        elif len(next_links) == 2:
            next_link = driver.find_elements_by_link_text("...")[1]
        else:  # no continuation links
            finished = True
            continue
        prev_link = driver.find_element_by_link_text(str(next_page - 2))
        next_link.click()
        # instead of waiting for link to phase out, wait for new set of links
        next_page = next_page + 1
        wait.until(EC.staleness_of(prev_link))

def processResultsPage(soup):
    rows = soup.findAll('tr', class_=re.compile("Grid(Alt)?Row"))
    for row in rows:
        ID = unicode(row.findAll('td')[1].string)
        name = unicode(row.find('a').string).encode('ascii', 'xmlcharrefreplace')
        profile_url = row.find('a')['href']
        address1 = unicode(row.findAll('td')[3].contents[0]).encode('ascii', 'xmlcharrefreplace')
        try:
            address2 = unicode(row.findAll('td')[3].contents[2]).encode('ascii', 'xmlcharrefreplace')
        except IndexError:
            address2 = ""
        address = address1 + "\n" + address2
        try:
            cur.execute("INSERT INTO corp_name (ID, name, address, profile_url) "
                        "VALUES (%s, %s, %s, %s)", (ID, name, address, profile_url))
        except psycopg2.IntegrityError as e:
            log.write("\tproblem with entry\n\tID: " + ID + "\n\tname: " + name + "\n")
            log.write("\t" + e.pgerror)
            db.rollback()
        else:
            db.commit()

log.write('Starting at ' + str(datetime.now().ctime()) + ':\n')
#need to do co-cz (r?) also so-sz/r, uq-?
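# generate three-letter search strings: first letter in {c, s, u},
# second letter o-z (string.lowercase[14:]), third letter a-z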
for char in 'csu':
    for char2 in string.lowercase[14:]:
        for char3 in string.lowercase:
            search_string = char + char2 + char3
            getSearchResults(search_string)
driver.quit()  # quit() ends the WebDriver session, not just the window
cur.close()
db.close()
log.close()
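# To run (a sketch of the assumed environment, not verified against this repo):
#   Python 2.7 with selenium (pre-4.x, for the find_element_by_* API),
#   beautifulsoup4, lxml, and psycopg2 installed; Firefox available to
#   webdriver.Firefox(); and a reachable PostgreSQL database matching the
#   connect() string above.
#
#   $ python corp_search.py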