-
Notifications
You must be signed in to change notification settings - Fork 0
/
simplyhired_scraper.py
123 lines (117 loc) · 5.8 KB
/
simplyhired_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from urllib import quote_plus
import re
import geocoder
from super_dt_parser import safe_dt_parse
from posting_class import Posting
from posting_scraper import PostingScraper
from sys import stderr
from urlparse import urlunsplit
import traceback
__author__ = 'mcs'
class SimplyhiredScraper(PostingScraper):
_job_results_list_div_attrs = {"class": "results"}
_job_result_div_attrs = {"class": "job", "itemtype": re.compile(r"JobPosting")}
_job_title_link_attrs = {"class": "title"}
_job_tools_container_attrs = {"class": "tools_container"}
_description_page_info_table_attrs = {"class": "info-table"}
_description_page_description_attrs = {"class": "job-description"}
_description_page_table_info_label_attrs = {"class": "info-label"}
def __init__(self, location=""):
PostingScraper.__init__(self, "http://www.simplyhired.com")
if location:
self.location = location
else:
self.location = ""
def _get_info_from_simplyhired_result(self, result_soup):
posting = Posting({'source': self.source})
# external url, title
try:
title_link = result_soup.find('a', attrs=SimplyhiredScraper._job_title_link_attrs)
posting['external_url'] = self._clean_post_url(title_link['href'])
posting['title'] = PostingScraper._encode_unicode(title_link.text)
except (AttributeError, KeyError, TypeError):
# traceback.print_exc(file=stderr)
pass
# url
description_page_url = None
try:
tools_container = result_soup.find('div', attrs=SimplyhiredScraper._job_tools_container_attrs)
tools_links = tools_container.findAll('a')
description_page_url = tools_links[-1]['href']
posting['url'] = description_page_url
posting['unique_id'] = description_page_url # TODO i couldn't actually find a unique id?
except (KeyError, AttributeError, IndexError):
# traceback.print_exc(file=stderr)
pass
if description_page_url is not None:
# follow posting url to long description, and date posted
description_page_soup = PostingScraper._get_cleaned_soup_from_url(description_page_url)
try:
info_table = description_page_soup.find('table',
attrs=SimplyhiredScraper._description_page_info_table_attrs)
# 4 rows: Company, Location, Date Posted, Source
row_data_two = []
trs = info_table.findAll('tr')
for tr in trs:
tds = tr.findAll('td')
try:
last_td = tds[-1]
row_data_two.append(PostingScraper._encode_unicode(last_td.text))
except IndexError:
# traceback.print_exc(file=stderr)
pass
info_labels = info_table.findAll('td', attrs=SimplyhiredScraper._description_page_table_info_label_attrs)
info_labels = map(lambda x: PostingScraper._encode_unicode(x.text).lower(), info_labels)
table_data = zip(info_labels, row_data_two)
for label, value in table_data:
if not value.strip():
continue
if 'location' in label:
try:
g = geocoder.google(value, method='reverse')
posting['location'] = (g.city, g.state, g.country)
except Exception:
# traceback.print_exc(file=stderr)
pass
elif 'date posted' in label:
try:
posting['date_posted'] = safe_dt_parse(value)
except (AttributeError, ValueError):
# traceback.print_exc(file=stderr)
pass
elif 'source' in label:
posting['external_source'] = value
elif 'company' in label:
posting['company'] = value
except AttributeError:
# traceback.print_exc(file=stderr)
pass
# description
try:
description_div = description_page_soup.find('div', attrs=SimplyhiredScraper._description_page_description_attrs)
posting['description'] = PostingScraper._encode_unicode(description_div.text)
except AttributeError:
# traceback.print_exc(file=stderr)
pass
return posting
def get_postings(self, query, pages=1):
try:
postings = []
query = quote_plus(query)
for page_num in range(1, pages+1):
# https://www.simplyhired.com/search?q=massage+therapist&l=baltimore%2C+md&pn=2
search_url = urlunsplit((self.scheme, self.source, "search",
"q=%s&l=%s&pn=%d" % (query, quote_plus(self.location.lower()), page_num), ""))
print >> stderr, search_url
soup = PostingScraper._get_cleaned_soup_from_url(search_url)
job_results_list_div = soup.find('div', attrs=SimplyhiredScraper._job_results_list_div_attrs)
try:
postings.extend(job_results_list_div.findAll('div', SimplyhiredScraper._job_result_div_attrs))
except AttributeError:
# traceback.print_exc(file=stderr)
pass
return list(set(PostingScraper._remove_none_from_things(
map(self._get_info_from_simplyhired_result, postings))))
except Exception:
traceback.print_exc(file=stderr)
return []