profilefinder.py
#!/usr/bin/env python
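"""
profilefinder.py

Matches GitHub developer profiles to public LinkedIn profiles: googles each
developer's name for LinkedIn results, parses and scores the candidate
profiles against the GitHub data (location, employer, websites), and can
optionally fall back to a Pipl email search for weak matches.
"""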
import csv
import sys
import re
import urllib
from urlparse import urlparse
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup
import liparser
import piplsearch
import utils
import pdb
def clean_urls(dirty_url):
""" Strip protocol and trailing '/' from blog info profided by GitHub profile"""
url = urlparse(dirty_url)
if url.path == '/':
clean_url = url.netloc
else:
clean_url = url.netloc + url.path
return clean_url
def normalize(s):
# add line to remove accents later using: unicodedata.normalize('NFKD', string)
if s:
s = s.lower()
return s
else:
return
def google_for_li_matches(person):
"""
Googles a name to find promising LinkedIn profiles. For exact name matches
from the first page of results, returns the LinkedIn public profile urls in
a list.
"""
gh_name = person.get('name')
gh_city = person.get('city')
li_profiles = []
# perform the Google search
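    # spoof a desktop-browser user agent so Google serves the regular results
    # page to headless PhantomJS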
dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
        "(KHTML, like Gecko) Chrome/15.0.87")
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get('http://www.google.com')
input_box = driver.find_element_by_name("q")
input_box.send_keys(gh_name + " " + gh_city + " software" + " site:linkedin.com")
input_box.submit()
try:
WebDriverWait(driver, 5).until(EC.title_contains(gh_name))
page = driver.page_source
    except Exception:
        print "Timeout googling for %s." % gh_name
return
finally:
driver.quit()
# parse the search results
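    # Google result links typically wrap the target address as /url?q=<target>&...,
    # so this pattern pulls the destination URL out of the redirect href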
url_pattern = re.compile(r'q=(h.+?)&')
soup = BeautifulSoup(page, "html5lib")
results = soup.select('h3.r')
for result in results:
        # test each result for an exact name match
li_name = result.a.get_text()
        li_name_cleaned = li_name.split('|')[0].strip()
if normalize(li_name_cleaned) == normalize(gh_name):
# get the link to this LinkedIn profile
match = re.search(url_pattern, result.a['href'])
if match:
match_url = {'url': match.group(1), 'score':'', 'parsed_profile':''}
li_profiles.append(match_url)
return li_profiles
def evaluate_li_matches(person):
"""
Receives a list of linkedIn profile URLs that might belong to "person".
Tries to load, parse and score each potential match. Returns the sorted
list (best match first).
"""
for match in person['li_matches']:
# fetch and parse the LinkedIn profile
parsed_profile = liparser.parse_li_profile(match.get('url'), person.get('name'))
# if we are able to parse the profile, then score it
        if parsed_profile:
match['score'] = score_li_matches(person, parsed_profile)
match['parsed_profile'] = parsed_profile
else:
match['score'] = -1 # unable to parse the LinkedIn page
# sort from highest scoring match to lowest
    return sorted(person['li_matches'], key=lambda k: k['score'], reverse=True)
def score_li_matches(gh, li):
"""
Receives a GitHub developer's profile (gh) and a parsed LinkedIn
profile (li). Tries to find information in the li that matches
that of the gh. Scores according to how many common items they have.
"""
def test_location():
score = 0
        if gh.get('city') and li.get('location') and gh['city'] in li['location']:
score = 25
return score
def test_employment():
score = 0
job = ''
jobs = []
if gh.get('company'):
# get company listed in the GitHub profile
job = normalize(gh.get('company'))
# match GitHub company against LinkedIn employment history.
if li.get('employment'):
jobs = map(normalize, [position['company name'] for position in li['employment']])
if job in jobs:
score += 50
# match GitHub company against employer in the Linkedin profile Headline.
if li.get('headline'):
if li['headline']['employer']:
jobs = normalize(li['headline']['employer'])
if job in jobs:
score += 50
# elif
# TODO: match email and personal website domain against LinkedIn employment history.
else:
            # we have nothing in the GitHub profile we can use to match against LinkedIn employment history.
pass
return score
def test_websites():
score = 0
# blog listed on GitHub user profile page
web = set([gh.get('website')])
# we add the GitHub user's profile page
web.add(gh.get('github_url'))
# websites listed on the Linkedin profile we are scoring
        webs = set((li.get('websites') or {}).values())
# compare the two sets
if web & webs:
score = 100 * len(web & webs)
return score
return sum([test_location(), test_employment(), test_websites()])
def try_piplsearch(person):
    """
    Looks up the person's email address with the Pipl API. If LinkedIn
    profiles are found, parses them and returns the last one as a match dict
    in the same form used by google_for_li_matches; the caller appends it to
    the person's li_matches.
    """
    dic = {}
    results = piplsearch.pipl_search({'email': person.get('email')})
    # if we found LinkedIn profiles, convert them to the standard form
    for socmedia in [socmedia for socmedia in results if socmedia['site_name'] == 'LinkedIn']:
        parsed_profile = liparser.parse_li_profile(socmedia.get('url'), person.get('name'))
        dic = {'url': socmedia.get('url'), 'score': 99, 'parsed_profile': parsed_profile}
    return dic
if __name__ == '__main__':
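    # Python 2 hack: make UTF-8 the default string encoding so printing names
    # with non-ASCII characters doesn't raise UnicodeEncodeError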
reload(sys)
sys.setdefaultencoding('utf-8')
devs, log = utils.setup()
# Use Google to find potential LinkedIn matches
for dev in devs:
try:
            print '\rGoogling for matches for %s...' % dev.get('name')
dev['li_matches'] = google_for_li_matches(dev)
        except Exception:
            print "%s occurred while processing: %s" % (sys.exc_info()[0].__name__, dev['name'])
continue
utils.save_as_json(devs, 'googlesearchresults')
print "Done.\n"
# Compare the LinkedIn profiles to the GitHub profile, score them and sort them. Return best at index[0]
devs = utils.load_json('googlesearchresults')
for dev in devs:
print '\rEvaluating matches for %s...' % dev.get('name')
dev['li_matches'] = evaluate_li_matches(dev)
utils.save_as_json(devs, 'scoredresults')
print "Done.\n"
"""
# Use Pipl to match remainder
for dev in [dev for dev in devs if dev['email']]:
if dev['li_matches']:
if dev['li_matches'][0]['score'] < 75:
print "Trying piplsearch for %s..." % dev.get('name')
result = try_piplsearch(dev)
if result:
dev['li_matches'].append(result)
# re-sort from highest scoring match to lowest
dev['li_matches'] = sorted(dev['li_matches'], key =lambda k: k['score'], reverse=True)
else:
print "Trying piplsearch for %s..." % dev.get('name')
result = try_piplsearch(dev)
if result:
dev['li_matches'].append(result)
# re-sort from highest scoring match to lowest
dev['li_matches'] = sorted(dev['li_matches'], key =lambda k: k['score'], reverse=True)
"""
# See the results of running through both algos
#devs = utils.load_json('scoredresults')
utils.print_results(devs)
#utils.sanity_check(devs)