upwork_scraper.py
from urllib import quote_plus  # Python 2; lives in urllib.parse on Python 3
import re
from super_dt_parser import safe_dt_parse
from posting_class import Posting
from posting_scraper import PostingScraper
from sys import stderr
from urlparse import urlsplit, urlunsplit  # Python 2; urllib.parse on Python 3
import traceback

__author__ = 'mcs'

class UpworkScraper(PostingScraper):
    _job_search_result_link_attrs = {"class": "break", "itemprop": "url"}
    _job_url_attrs = {"property": "og:url"}
    _job_container_attrs = {"class": "container", "itemtype": "http://schema.org/JobPosting"}
    _job_dateposted_attrs = {"itemprop": "datePosted"}
    _job_descrip_aircard_attrs = {"class": "air-card-group"}
    _job_skill_tag_attrs = {"class": re.compile("^o-tag-skill")}
    _div_row_attrs = {'class': 'row'}
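
    # Illustrative only (an assumption inferred from the selectors above, not
    # captured from a live Upwork page): the markup being targeted is shaped
    # roughly like
    #   <article><a class="break" itemprop="url" href="..."></a></article>
    #   <meta property="og:url" content="...">
    #   <div class="container" itemtype="http://schema.org/JobPosting">
    #     <span itemprop="datePosted" popover="..."></span>
    #     <div class="row">...price rows...</div>
    #     <div class="air-card-group"><p>description</p></div>
    #     <a class="o-tag-skill-...">skill</a>
    #   </div>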

    def __init__(self):
        PostingScraper.__init__(self, "http://www.upwork.com")

    def _get_info_from_upwork_posting(self, posting_soup):
        """
        Given the Soup-ed HTML of an Upwork posting, extract the desired
        information and return it as a Posting.
        :param posting_soup: the Soup-ed HTML
        :return: the data in a Posting
        """
        posting = Posting({'source': self.source})
        # url, title, and unique id
        try:
            url = posting_soup.find('meta', attrs=UpworkScraper._job_url_attrs)
            if url is not None:
                posting['url'] = self._clean_post_url(url['content'])
                posting['title'] = PostingScraper._encode_unicode(url.text)
                url_parts = urlsplit(posting['url'])
                if url_parts.path:
                    # the unique id is the last non-empty segment of the url path;
                    # filter in one pass rather than calling remove() while
                    # iterating, which skips elements
                    sections = [s for s in url_parts.path.rsplit('/') if s]
                    posting['unique_id'] = sections[-1]
        except (KeyError, AttributeError, IndexError):
            # traceback.print_exc(file=stderr)
            pass
        container = posting_soup.find(attrs=UpworkScraper._job_container_attrs)
        # date posted -- it lives in the 'popover' attribute
        try:
            date_posted_span = container.find(attrs=UpworkScraper._job_dateposted_attrs)
            posting['date_posted'] = safe_dt_parse(date_posted_span['popover'])
        except (KeyError, AttributeError, ValueError, TypeError):
            # AttributeError also covers the case where no container was found
            # traceback.print_exc(file=stderr)
            pass
        # price: the second 'row' div of the container, its first child div,
        # and the first 'row' div inside that
        try:
            second_row = container.findAll('div', attrs=UpworkScraper._div_row_attrs)[1]
            first_child = second_row.find('div')
            price_row = first_child.find('div', attrs=UpworkScraper._div_row_attrs)
            posting['price_info'] = PostingScraper._encode_unicode(price_row.text)
        except (IndexError, AttributeError):
            # IndexError: the container has no second 'row' div;
            # AttributeError: one of the find() calls above returned None
            # traceback.print_exc(file=stderr)
            pass
        # description text
        try:
            description_air_card = container.find('div', attrs=UpworkScraper._job_descrip_aircard_attrs)
            posting['description'] = PostingScraper._encode_unicode(description_air_card.find('p').text)
        except AttributeError:  # handle if soup finds nothing
            # traceback.print_exc(file=stderr)
            pass
        # skills
        try:
            posting['skills'] = map(lambda x: PostingScraper._encode_unicode(x.text),
                                    container.findAll('a', attrs=UpworkScraper._job_skill_tag_attrs))
        except AttributeError:  # handle if soup finds nothing for skills
            pass
        return posting
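
    # For reference (an illustrative summary, not a guarantee): a fully-scraped
    # Posting carries the keys 'source', 'url', 'title', 'unique_id',
    # 'date_posted', 'price_info', 'description', and 'skills'; any of them
    # except 'source' may be absent if the corresponding step above failed.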

    def get_postings(self, query, pages=1):
        try:
            query = quote_plus(query)  # don't use urlencode; some sites depend on argument order
            posts = []
            # example url:
            # https://www.upwork.com/o/jobs/browse/?q=therapist
            for i in range(1, pages + 1):
                search_url = urlunsplit((self.scheme, self.source, "/o/jobs/browse/",
                                         "page=%d&q=%s" % (i, query), ""))
                print >> stderr, search_url
                soup = PostingScraper._get_cleaned_soup_from_url(search_url)
                # the search page lists postings; collect each posting's url
                for article in soup.findAll('article'):
                    url = article.find('a', attrs=UpworkScraper._job_search_result_link_attrs)
                    try:
                        posts.append(self._clean_post_url(url['href']))
                    except (TypeError, KeyError):  # no matching link, or no href on it
                        pass
            # visit each posting, scrape it, and drop failures and duplicates
            return list(set(PostingScraper._remove_none_from_things(
                map(self._get_info_from_upwork_posting,
                    map(PostingScraper._get_cleaned_soup_from_url, posts)))))
        except Exception:
            traceback.print_exc(file=stderr)
            return []
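
# A minimal usage sketch, not part of the original module: it assumes the sibling
# modules imported above (posting_scraper, posting_class, super_dt_parser) are on
# the path, and that Posting behaves like a dict, as its use above suggests.
if __name__ == '__main__':
    scraper = UpworkScraper()
    # fetch and scrape the first two result pages for a sample query
    for posting in scraper.get_postings("therapist", pages=2):
        print posting.get('title'), posting.get('unique_id'), posting.get('url')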