-
Notifications
You must be signed in to change notification settings - Fork 1
/
lead_scraper.py
176 lines (153 loc) · 6.66 KB
/
lead_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
from lxml import html
from lxml.html.clean import Cleaner
import lxml
import requests
import sqlite3
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email import Encoders
import time
class Spider(object):
    """Base class for site-specific crawlers.

    Holds the three search parameters every scraper needs (search term,
    city, province) so subclasses don't repeat the boilerplate.
    """
    def __init__(self, searchterm, city, province):
        # Single tuple assignment keeps the common parameters together.
        self.searchterm, self.city, self.province = searchterm, city, province
class Indeed(object):
    """indeed.ca scraper.

    Crawls paginated indeed.ca search results for ``searchterm`` in
    ``city``/``province``, extracts job titles, links and summaries, and
    stores them via ``Database.add_entry``.
    """
    def __init__(self, searchterm, city, province):
        self.searchterm = searchterm
        self.city = city
        self.province = province

    def crawl(self):
        """Fetch result pages until the 'Next' button disappears.

        Side effects: HTTP requests to ca.indeed.com and inserts into the
        local SQLite DB through ``Database.add_entry``.
        """
        crawling = True
        count = 0  # result offset for the &start= query parameter
        time.sleep(5)  # polite delay before hitting the site
        while crawling:
            searchterm = self.searchterm
            city = self.city
            prov = self.province
            # NOTE(review): the '+' placement ("+city+%2Cprov") differs from
            # the usual "city%2C+prov" indeed URL shape — confirm it still
            # returns the intended location's results.
            url = "http://ca.indeed.com/jobs?q={0}&l=+{1}+%2C{2}&start={3}".format(searchterm, city, prov, str(count))
            print(url, 'current URL')
            page = requests.get(url)
            tree = html.fromstring(page.text)
            # Strip <b> tags: they fragmented descriptions onto multiple rows.
            cleaner = Cleaner()
            cleaner.remove_tags = ['b']
            tree = cleaner.clean_html(tree)
            jobtitles = tree.xpath('//h2[@class="jobtitle"]/a/text()')
            joblinks = tree.xpath('//h2[@class="jobtitle"]/a/@href')
            job_descriptions = tree.xpath('//span[@class="summary"]/text()')
            jobtitles = (job.lstrip() for job in jobtitles)
            joblinks = (job.lstrip() for job in joblinks)
            Database.add_entry(zip(jobtitles, joblinks, job_descriptions))
            link_pages = tree.xpath('//div[@class="pagination"]/a/@href')
            print(link_pages, 'link_pages')
            # The 'Next' button disappears on the last results page.
            next_button = tree.xpath('//*[@id="resultsCol"]/div/a/span/span/text()')
            next_button_str = ''.join(next_button)
            print(next_button)
            if u'Next' in next_button_str:
                print('found next will continue scraping...')
            else:
                print('Hit last page, crawler will stop...')
                crawling = False
            for page in link_pages:
                # The offset is the trailing digits of each pagination URL;
                # the last 6 characters are more than enough to contain them.
                p = page[-6:]
                digits_url = ''.join([d for d in p if d.isdigit()])
                try:
                    print(digits_url, 'digits url')
                    # BUG FIX: the original compared the *string* digits_url
                    # to the int count (always True in Python 2, TypeError in
                    # Python 3), so "previous page" links could drag count
                    # backwards. Compare as integers.
                    if int(digits_url) > count:
                        print(page, 'page')
                        count = int(digits_url)
                        print(count, 'count')
                    else:
                        print(digits_url, 'current count {}'.format(count))
                except ValueError:
                    # First-page links carry no offset digits.
                    print('This failed', digits_url)
class Database(object):
    """Thin static wrapper around the local ``jobs.db`` SQLite database."""

    @staticmethod
    def add_entry(job_offers):
        """Insert an iterable of (title, link, description) triples.

        Creates the ``jobs`` table on first use; commits via the connection
        context manager.
        """
        conn = sqlite3.connect('jobs.db')
        with conn:
            c = conn.cursor()
            # Create table if needed
            table = 'CREATE TABLE IF NOT EXISTS jobs (id INTEGER PRIMARY KEY, job_titles TEXT, job_links TEXT, job_descriptions TEXT)'
            c.execute(table)
            # Insert all rows in one call
            c.executemany('''INSERT INTO jobs (job_titles, job_links, job_descriptions) VALUES (?,?,?)''', job_offers)

    @staticmethod
    def filter_jobs(search_term):
        """Return the set of DB rows matching ``search_term``.

        A row matches when the term appears (case-insensitively) in its
        job title (column 1) or description (column 3).

        BUG FIX: the original made two passes, then built a set of two big
        tuples (all title matches, all description matches) — despite the
        "removes doubles" comment, individual duplicate jobs were never
        deduplicated, and the IndexError handler around the final print was
        unreachable. Collect individual rows into a set in a single pass.
        """
        term = search_term.lower()
        conn = sqlite3.connect('jobs.db')
        with conn:
            c = conn.cursor()
            c.execute('''SELECT * FROM jobs''')
            # Rows are plain tuples, hence hashable: the set removes a job
            # that matched on both title and description.
            matches = set(
                row for row in c
                if term in row[1].lower() or term in row[3].lower()
            )
            print(matches)
            return matches
def send_mail(recipient, jobs, sender=None, password=None):
    """Email the string form of ``jobs`` to ``recipient`` as plain text.

    :param recipient: destination email address.
    :param jobs: any object; ``str(jobs)`` becomes the mail body.
    :param sender: SMTP account to send from (keyword; required at runtime).
    :param password: password for ``sender`` (keyword; required at runtime).
    :raises ValueError: if ``sender`` or ``password`` is missing.

    BUG FIX: the original read ``sender`` and ``password`` from module
    globals that are never defined anywhere in this file, so every call
    died with NameError. They are now explicit keyword parameters, which
    keeps the two-argument call signature backward compatible.

    Yahoo SMTP rejected these mails; outlook/gmail are known to work.
    """
    if sender is None or password is None:
        raise ValueError('send_mail requires sender and password credentials')
    # str(jobs) directly — the original copied it one character at a time.
    str_jobs = str(jobs).encode('utf-8')
    msg = MIMEMultipart('alternative')
    # NOTE(review): the '{}' placeholder was never .format()ed in the
    # original either — the subject goes out verbatim; confirm intent.
    msg['Subject'] = 'Job found matching {} description.'
    msg['From'] = sender
    msg['To'] = recipient
    body = MIMEText(str_jobs, 'plain', 'UTF-8')
    msg.attach(body)
    # yahoo is the problem use google or another smtp provider
    # server = smtplib.SMTP('smtp.mail.yahoo.com', 587)
    server = smtplib.SMTP('smtp-mail.outlook.com', 25)
    server.ehlo()
    server.starttls()
    server.ehlo()
    server.login(sender, password)
    print('sending mail..')
    server.sendmail(sender, [recipient], msg.as_string())
    print('mail sent')
    server.quit()
def main():
    """Demo driver: scrape first, then filter and mail.

    Run the scraper to populate the SQLite DB, then use
    ``Database.filter_jobs`` to find what you're looking for.
    ex: Indeed(job, city, province/state)
    """
    scraper = Indeed('it', 'Montréal', 'QC')
    scraper.crawl()
    # search_terms = Database.filter_jobs('trainsim')
    # search_terms = Database.filter_jobs
    # send_mail('recipient@email.com', search_terms('manager'))


if __name__ == '__main__':
    main()