def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)
        postings = []
        for page_num in range(1, pages + 1):
            search_url = urlunsplit((
                self.scheme, self.source, "jobs",
                "q=%s&l=%s&sort=date&start=%d" % (query, self.location, (page_num - 1) * 10),
                "",
            ))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_td = soup.find("td", attrs=IndeedScraper._job_results_col_attrs)
            try:
                postings.extend(job_results_td.findAll("div", IndeedScraper._job_row_result_div_attrs))
            except AttributeError:
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_indeed_result, postings))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    postings = []
    try:
        query = re.sub(' ', '-', query)  # funny syntax for guru website
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            search_url = urlunsplit((self.scheme, self.source, "d/jobs/q/%s/pg/%d" % (query, page_num), "", ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            services_list = soup.find(attrs=GuruScraper._job_search_results_list_attr)
            try:
                # handle if there are more pages than results... services_list won't exist
                for i, li in enumerate(services_list.findAll('li', attrs=GuruScraper._job_search_result_list_item_attrs)):
                    h2 = li.find('h2', attrs=GuruScraper._job_search_results_header_attrs)
                    a = h2.find('a')
                    postings.append(self._clean_post_url(a['href']))
            except (AttributeError, TypeError, KeyError):
                # also handle misc errors; gracefully return the postings we already have
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_guru_job_page_soup,
                map(PostingScraper._get_cleaned_soup_from_url, postings)))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def _get_info_from_indeed_result(self, row_result_soup):
    posting = Posting({"source": self.source})
    # url, title
    try:
        url_data = row_result_soup.find("a")
        posting["url"] = self._clean_post_url(url_data["href"])
        posting["title"] = PostingScraper._encode_unicode(url_data.text)
    except (AttributeError, KeyError, TypeError):
        pass
    # id
    try:
        posting["unique_id"] = row_result_soup["data-jk"]
    except KeyError:
        pass
    # location
    try:
        loc = row_result_soup.find("span", IndeedScraper._job_location_span_attrs).text
        if loc is not None:
            g = geocoder.google(loc, method="reverse")
            posting["location"] = (g.city, g.state, g.country)
    except (AttributeError, Exception):
        # traceback.print_exc(file=stderr)
        pass
    # date posted
    try:
        date_posted_span = row_result_soup.find("span", attrs=IndeedScraper._job_date_span_attrs)
        date_posted_text = PostingScraper._encode_unicode(date_posted_span.text).lower()
        if date_posted_text == "just posted":
            date_posted_text = "now"
        try:
            posting["date_posted"] = safe_dt_parse(date_posted_text)  # also throws AttributeError
        except (AttributeError, ValueError):
            # traceback.print_exc(file=stderr)
            pass
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(
            row_result_soup.find("span", attrs=IndeedScraper._job_description_span_attrs).text
        )
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    if "description" not in posting:
        # try summary instead
        try:
            posting["description"] = PostingScraper._encode_unicode(
                row_result_soup.find("span", attrs=IndeedScraper._job_summary_span_attrs).text
            )
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
def get_postings(self, query, pages=1):
    try:
        postings = []
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            # e.g. https://www.simplyhired.com/search?q=massage+therapist&l=baltimore%2C+md&pn=2
            search_url = urlunsplit((self.scheme, self.source, "search",
                                     "q=%s&l=%s&pn=%d" % (query, quote_plus(self.location.lower()), page_num),
                                     ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_list_div = soup.find('div', attrs=SimplyhiredScraper._job_results_list_div_attrs)
            try:
                postings.extend(job_results_list_div.findAll('div', SimplyhiredScraper._job_result_div_attrs))
            except AttributeError:
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_simplyhired_result, postings))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
        posts = []  # temporary variable to store all of the posting data
        for i in range(1, pages + 1):
            search_url = urlunsplit(
                (self.scheme, self.source, "/search/ggg", "query=%s&sort=date?s=%d" % (query, i * 100), "")
            )
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            try:
                posts += [
                    self._clean_post_url(a["href"])
                    for a in soup.findAll("a", {"data-id": re.compile(r"\d+")})
                ]
            except KeyError:  # handle if no href
                pass
        return list(set(PostingScraper._remove_none_from_things(
            [self._get_info_from_clp_posting(post) for post in posts])))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
        posts = []
        # example url:
        # https://www.upwork.com/o/jobs/browse/?q=therapist
        for i in range(1, pages + 1):
            search_url = urlunsplit((self.scheme, self.source, "/o/jobs/browse/", "page=%d&q=%s" % (i, query), ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            # this url returns a list of posting summaries; collect the link to each posting's page
            for article in soup.findAll('article'):  # get all 'article' tags
                url = article.find('a', attrs=UpworkScraper._job_search_result_link_attrs)
                try:
                    posts.append(self._clean_post_url(url['href']))
                except (TypeError, KeyError):
                    pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_upwork_posting,
                map(PostingScraper._get_cleaned_soup_from_url, posts)))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def _get_info_from_guru_job_page_soup(self, posting_soup):
    posting = Posting({'source': self.source})
    # url and unique id
    url = posting_soup.find('meta', attrs=GuruScraper._job_url_meta_attrs)
    if url is not None:
        posting['url'] = self._clean_post_url(url['content'])
        url_parts = urlsplit(posting['url'])
        if url_parts.path:
            sections = url_parts.path.rsplit('/')
            sections = [s for s in sections if s]  # drop empty path segments
            posting['unique_id'] = sections[-1]
    # title
    try:
        title_header = posting_soup.find(attrs=GuruScraper._job_title_attrs)
        posting['title'] = PostingScraper._encode_unicode(title_header.text)
    except AttributeError:
        pass
    # date posted
    try:
        date_posted_span = posting_soup.find('span', attrs=GuruScraper._job_date_posted_span_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['data-date'])
    except (KeyError, AttributeError, ValueError):
        # traceback.print_exc(file=stderr)
        pass
    # duration
    try:
        duration_span = posting_soup.find('span', attrs=GuruScraper._job_duration_span_attrs)
        actual_date_span = duration_span.find('span')
        posting['duration'] = safe_dt_parse(actual_date_span['data-date'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # budget
    try:
        budget_div = posting_soup.find('div', attrs=GuruScraper._job_budget_div_attrs)
        posting['budget'] = PostingScraper._encode_unicode(budget_div.text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # skills: one entry per skill link
    try:
        skills_section = posting_soup.find(attrs=GuruScraper._job_skill_section_attrs)
        posting['skills'] = map(
            lambda x: PostingScraper._encode_unicode(PostingScraper._get_cleaned_soup_from_html(str(x)).text),
            skills_section.findAll('a', attrs=GuruScraper._job_skill_link_attrs))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # experience requirements / description
    try:
        description_section = posting_soup.find(attrs=GuruScraper._job_experience_reqs_section_attrs)
        posting['description'] = PostingScraper._encode_unicode(
            PostingScraper._get_cleaned_soup_from_html(str(description_section)).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
def _get_info_from_clp_posting(self, url):
    posting = Posting({"source": self.source})
    soup = PostingScraper._get_cleaned_soup_from_url(url)
    posting["url"] = url
    # title
    try:
        posting["title"] = PostingScraper._encode_unicode(
            soup.find(attrs=CraigslistScraper._post_title_attrs).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # location, reverse-geocoded from the google maps link
    try:
        loc = soup.find(href=re.compile(r"google\.com/maps"))["href"]
        g = geocoder.google(loc[loc.find("@") + 1:-1].split(",")[:-1], method="reverse")
        posting["state"] = g.state
        posting["city"] = g.city
        posting["country"] = g.country
    except Exception:
        pass
    # compensation
    try:
        posting["price"] = PostingScraper._encode_unicode(
            soup.find(text=re.compile(".*compensation.*")).parent.findChild("b").text
        ).replace("$", "")
    except Exception:
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(soup.find("section", {"id": "postingbody"}).text)
    except Exception:
        pass
    # date posted
    try:
        posting["date_posted"] = soup.find("time")["datetime"]
    except Exception:
        pass
    # unique id, from the numeric part of the posting url
    try:
        posting["unique_id"] = "clgig" + re.match(r"[^\d]*(\d+)\.html", url).group(1)
    except Exception:
        pass
    return posting
def get_postings(self, query, pages=1):
    try:
        postings = []
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            # e.g. https://www.ziprecruiter.com/candidate/search?sort=best-match&search=writer&page=2&location=baltimore
            search_url = urlunsplit((self.scheme, self.source, "candidate/search",
                                     "sort=best-match&search=%s&location=%s&page=%d" % (
                                         query, quote_plus(self.location.lower()), page_num),
                                     ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_list_div = soup.find('div', attrs=ZipRecruiterScraper._job_list_div_attrs)
            try:
                postings.extend(map(lambda x: x['href'],
                                    job_results_list_div.findAll('a', ZipRecruiterScraper._job_result_link_attrs)))
            except AttributeError:
                # traceback.print_exc(file=stderr)
                pass
        # note that we could return None if we go to an external url here
        postings = map(self._get_info_from_ziprecruiter_result, postings)
        return list(set(PostingScraper._remove_none_from_things(postings)))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def __init__(self, location=""): PostingScraper.__init__(self, "http://www.indeed.com") if location: self.location = location else: self.location = ""
def _get_info_from_upwork_posting(self, posting_soup):
    """
    Given an Upwork article HTML object, extract the desired information and return as a dict
    :param posting_soup: the Soup-ed HTML
    :return: the data in a dict
    """
    posting = Posting({'source': self.source})
    # url, title, unique id
    try:
        url = posting_soup.find('meta', attrs=UpworkScraper._job_url_attrs)
        if url is not None:
            posting['url'] = self._clean_post_url(url['content'])
            posting['title'] = PostingScraper._encode_unicode(url.text)
            url_parts = urlsplit(posting['url'])
            if url_parts.path:
                sections = url_parts.path.rsplit('/')
                sections = [s for s in sections if s]  # drop empty path segments
                posting['unique_id'] = sections[-1]
    except (KeyError, AttributeError):
        # traceback.print_exc(file=stderr)
        pass
    container = posting_soup.find(attrs=UpworkScraper._job_container_attrs)
    # date posted
    try:
        # it's in the 'popover' attribute
        date_posted_span = container.find(attrs=UpworkScraper._job_dateposted_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['popover'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # price
    # second row of container, first element, first row inside that
    try:
        second_row = container.findAll('div', attrs=UpworkScraper._div_row_attrs)[1]
        try:
            first_child = second_row.find('div')
            price_row = first_child.find('div', attrs=UpworkScraper._div_row_attrs)
            try:
                posting['price_info'] = PostingScraper._encode_unicode(price_row.text)
            except AttributeError:
                # thrown if price_row is None
                # traceback.print_exc(file=stderr)
                pass
        except (IndexError, AttributeError):
            # thrown if second_row doesn't have a first child div
            # traceback.print_exc(file=stderr)
            pass
    except (IndexError, AttributeError):
        # thrown if the container is missing or doesn't have a second 'row' tag
        # traceback.print_exc(file=stderr)
        pass
    # text
    try:
        description_air_card = container.find('div', attrs=UpworkScraper._job_descrip_aircard_attrs)
        posting['description'] = PostingScraper._encode_unicode(description_air_card.find('p').text)
    except AttributeError:
        # handle if soup finds nothing
        # traceback.print_exc(file=stderr)
        pass
    # skills
    try:
        posting['skills'] = map(lambda x: PostingScraper._encode_unicode(x.text),
                                container.findAll('a', attrs=UpworkScraper._job_skill_tag_attrs))
    except AttributeError:
        # handle if soup finds nothing for skills
        pass
    return posting
def __init__(self): PostingScraper.__init__(self, "http://www.upwork.com")
def __init__(self, location=""): PostingScraper.__init__(self, "http://www.ziprecruiter.com") if location: self.location = location else: self.location = ""
def __init__(self, base_url):
    # if base == 'baltimore':
    #     base = 'http://baltimore.craigslist.org'
    if not base_url:
        raise ValueError("A base url must be specified for a Craigslist Scraper.")
    PostingScraper.__init__(self, base_url)
def _get_info_from_ziprecruiter_result(self, job_link):
    # print >> stderr, job_link
    soup = PostingScraper._get_cleaned_soup_from_url(job_link)
    if not len(soup):
        # print >> stderr, "returning none, soup false, link:%s" % job_link
        return None
    posting = Posting()
    # url
    try:
        url_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_ogurl_attrs)
        url = url_meta['content']
        posting.add_url(url)
        # id is the last series of alphanumeric characters after the last hyphen in the url path
        # e.g., /jobs/proedit-inc-18374379/contract-scientific-writer-2ccbf90f
        # would mean 2ccbf90f
        things = urlsplit(url)
        path = things[2]
        last_hyphen = path.rfind('-')
        if last_hyphen != -1:
            # print >> stderr, path[last_hyphen+1:]
            posting.add_id(path[last_hyphen + 1:])
        else:
            # just take the whole url path after the base
            # print >> stderr, "couldn't find id in url:%s" % url
            # print >> stderr, "making id:", path
            posting.add_id(path)
    except (TypeError, IndexError):
        # traceback.print_exc(file=stderr)
        pass
    # source
    posting.add_source(self.source)
    # title
    try:
        title_h1 = soup.find('h1', attrs=ZipRecruiterScraper._job_posting_h1_title_attrs)
        posting.add_title(PostingScraper._encode_unicode(title_h1.text))
    except AttributeError:  # if title_h1 is None
        # traceback.print_exc(file=stderr)
        pass
    # location
    try:
        # try the lat/long meta tag first, more exact
        geoloc_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_latlong_attrs)['content']
        geoloc_meta = re.sub(";", ",", geoloc_meta)
        g = geocoder.google(geoloc_meta.split(','), method='reverse')
        # print >> stderr, "reverse by latlong, loc:%s => g:%s" % (geoloc_meta, str(g))
        posting['state'] = g.state
        posting['city'] = g.city
        posting['country'] = g.country
    except (TypeError, AttributeError, KeyError):
        # fall back to finding the google map link
        try:
            maps_url = soup.find(href=re.compile(r"google\.com/maps|maps\.google\.com"))['href']
            things = urlsplit(maps_url)
            params = things[3]
            loc = parse_qs(params)
            loc_str = loc['q'][0]
            g = geocoder.google(loc_str)
            # print >> stderr, "normal by loc:%s => g:%s" % (loc_str, g)
            posting['state'] = g.state
            posting['city'] = g.city
            posting['country'] = g.country
        except (TypeError, AttributeError, KeyError, IndexError):
            # traceback.print_exc(file=stderr)
            pass
    # date posted
    try:
        try:
            # try this first, but if we fail...
            date_posted_p = soup.find('p', attrs=ZipRecruiterScraper._job_posting_p_date_posted_attrs)
            # find first 'span'
            date_posted_span = date_posted_p.find('span')
            dt_text = re.sub(r"[Pp]osted", "", date_posted_span.text.lower()).strip()
            posting.add_date_posted(safe_dt_parse(dt_text))
        except AttributeError:
            try:
                # ... double-check that we have a 'posted today / this week / 12 hours ago / whatever'
                locdiv = soup.find('div', attrs=ZipRecruiterScraper._job_posting_locdiv_attrs)
                header_div = locdiv.parent
                date_p = header_div.findAll('p')[-1]
                date_span = date_p.find('span')
                date_span_text = date_span.text.lower()
                if "posted" in date_span_text:
                    # and if so, take appropriate action
                    dt_text = re.sub(r"posted", "", date_span_text).strip()
                    posting.add_date_posted(safe_dt_parse(dt_text))
                else:
                    # print >> stderr, "don't have a date found at url:%s" % job_link
                    pass
            except (AttributeError, IndexError):
                # traceback.print_exc(file=stderr)
                pass
    except ValueError:
        print >> stderr, "error parsing date posted string at url:%s" % job_link
    # description
    try:
        description_div = soup.find('div', attrs=ZipRecruiterScraper._job_posting_div_description_attrs)
        posting.add_description(PostingScraper._encode_unicode(description_div.text))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
def _get_info_from_simplyhired_result(self, result_soup):
    posting = Posting({'source': self.source})
    # external url, title
    try:
        title_link = result_soup.find('a', attrs=SimplyhiredScraper._job_title_link_attrs)
        posting['external_url'] = self._clean_post_url(title_link['href'])
        posting['title'] = PostingScraper._encode_unicode(title_link.text)
    except (AttributeError, KeyError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # url
    description_page_url = None
    try:
        tools_container = result_soup.find('div', attrs=SimplyhiredScraper._job_tools_container_attrs)
        tools_links = tools_container.findAll('a')
        description_page_url = tools_links[-1]['href']
        posting['url'] = description_page_url
        posting['unique_id'] = description_page_url  # TODO couldn't actually find a unique id
    except (KeyError, AttributeError, IndexError):
        # traceback.print_exc(file=stderr)
        pass
    if description_page_url is not None:
        # follow the posting url to get the long description and date posted
        description_page_soup = PostingScraper._get_cleaned_soup_from_url(description_page_url)
        try:
            info_table = description_page_soup.find('table', attrs=SimplyhiredScraper._description_page_info_table_attrs)
            # 4 rows: Company, Location, Date Posted, Source
            row_data_two = []
            trs = info_table.findAll('tr')
            for tr in trs:
                tds = tr.findAll('td')
                try:
                    last_td = tds[-1]
                    row_data_two.append(PostingScraper._encode_unicode(last_td.text))
                except IndexError:
                    # traceback.print_exc(file=stderr)
                    pass
            info_labels = info_table.findAll('td', attrs=SimplyhiredScraper._description_page_table_info_label_attrs)
            info_labels = map(lambda x: PostingScraper._encode_unicode(x.text).lower(), info_labels)
            table_data = zip(info_labels, row_data_two)
            for label, value in table_data:
                if not value.strip():
                    continue
                if 'location' in label:
                    try:
                        g = geocoder.google(value, method='reverse')
                        posting['location'] = (g.city, g.state, g.country)
                    except Exception:
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'date posted' in label:
                    try:
                        posting['date_posted'] = safe_dt_parse(value)
                    except (AttributeError, ValueError):
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'source' in label:
                    posting['external_source'] = value
                elif 'company' in label:
                    posting['company'] = value
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
        # description
        try:
            description_div = description_page_soup.find('div', attrs=SimplyhiredScraper._description_page_description_attrs)
            posting['description'] = PostingScraper._encode_unicode(description_div.text)
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
def __init__(self): PostingScraper.__init__(self, "http://www.guru.com")