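"""Search LinkedIn (via Google) for a company's employees.

LinkedinTitleDir crawls LinkedIn title-directory pages surfaced by a
site-restricted Google query; GoogleSearch queries LinkedIn profile pages
directly. Both run as RQ jobs on a Redis connection exposed by the local
worker module, persist results through CompanyExtraInfoCrawl, and trigger
a Jigsaw CSV upload once every job in the batch has completed.
"""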
from bs4 import BeautifulSoup
import pandas as pd
import rq
from rq import Queue
from fuzzywuzzy import fuzz

# Project-local modules: Google search wrapper, batch-completion tracker,
# Redis connection, result persistence, and the Jigsaw CSV uploader.
from google import Google
from queue import RQueue
from worker import conn
from crawl import CompanyExtraInfoCrawl
from jigsaw import Jigsaw

q = Queue(connection=conn)
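
# Fan-out/fan-in pattern used throughout this module: each scrape runs as
# an RQ job whose meta carries the batch's queue_name. When
# RQueue()._has_completed reports that every job in the batch is done, the
# last job to finish enqueues Jigsaw()._upload_csv for the company.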
class LinkedinTitleDir:
    def test(self, company_name):
        # Debug hook: report batch progress and fire the upload if done.
        job = rq.get_current_job()
        print(job.meta.keys())
        if "queue_name" in job.meta:
            print(RQueue()._has_completed(job.meta["queue_name"]))
            print(RQueue()._has_completed("queue_name"))
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
    def _search(self, company_name, api_key=""):
        # LinkedIn title directories live at URLs like
        # linkedin.com/title/<role>-at-<company>, hence inurl:"at-{name}".
        qry = 'site:linkedin.com inurl:"at-{0}" inurl:title -inurl:job'
        # TODO: also strip [".", "'", ","] from the name
        name = company_name.strip().lower().replace(" ", "-")
        dirs = Google().search(qry.format(name), 1)
        for url in dirs.url:
            q.enqueue(LinkedinTitleDir().parse, url, company_name)
    def parse(self, url, company_name):
        # Parse employee entries out of Google's cached copy of the page.
        cache = Google().cache(url)
        soup = BeautifulSoup(cache, "html.parser")
        p = []
        for i in soup.find_all("div", {"class": "entityblock"}):
            try:
                img = i.find("img")["data-delayed-url"]
            except (KeyError, TypeError):
                img = i.find("img")["src"]
            profile = i.find("a")["href"]
            name = i.find("h3", {"class": "name"})
            name = name.text if name else ""
            title = i.find("p", {"class": "headline"})
            title = title.text if title else ""
            # Headlines look like "<title> at <company>".
            company = title.split(" at ")[-1]
            title = title.split(" at ")[0]
            city = i.find("dd")
            city = city.text if city else ""
            cols = ["img", "profile", "name", "title", "city", "company"]
            vals = [img, profile, name, title, city, company]
            print(vals)
            p.append(dict(zip(cols, vals)))
        print(p)
        results = pd.DataFrame(p)
        if results.empty:
            return p
        # Score how closely each parsed company matches the target;
        # partial_ratio tolerates extra tokens in multi-word names.
        if " " in company_name:
            results["company_score"] = [fuzz.partial_ratio(company_name, company)
                                        for company in results.company]
        else:
            results["company_score"] = [fuzz.ratio(company_name, company)
                                        for company in results.company]
        results = results[results.company_score > 64]
        data = {"data": results.to_dict("records"), "company_name": company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")
        # Batch-completion check (see module comment above).
        job = rq.get_current_job()
        print(job.meta.keys())
        if "queue_name" in job.meta:
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return p
class GoogleSearch:
    def test(self, company_name):
        # Debug hook: report batch progress and fire the upload if done.
        job = rq.get_current_job()
        print(job.meta.keys())
        if "queue_name" in job.meta:
            print(RQueue()._has_completed(job.meta["queue_name"]))
            print(RQueue()._has_completed("queue_name"))
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
    def _employees(self, company_name="", keyword=""):
        '''Scrape LinkedIn employee profiles via a Google site search.'''
        # TODO - add linkedin directory search
        # Exclude directory, search, update, job, and company pages.
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        results = Google().search(qry, 10)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        # fuzzywuzzy does plain string matching, not regex, so compare the
        # raw name; partial_ratio tolerates extra tokens in multi-word names.
        if " " in company_name:
            results["company_score"] = [fuzz.partial_ratio(company_name, company)
                                        for company in results.company]
        else:
            results["company_score"] = [fuzz.ratio(company_name, company)
                                        for company in results.company]
        if keyword != "":
            results["score"] = [fuzz.ratio(keyword, title)
                                for title in results.title]
            results = results[results.score > 75]
        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        data = {"data": results.to_dict("records"), "company_name": company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")
        # Batch-completion check (see module comment above).
        job = rq.get_current_job()
        print(job.meta.keys())
        if "queue_name" in job.meta:
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
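

# A minimal usage sketch (an assumed entry point, not part of the original
# module): with Redis running and an RQ worker consuming the queue from the
# worker module, both searches can be enqueued like any other job here.
# The company name "Acme" is a placeholder.
if __name__ == "__main__":
    q.enqueue(GoogleSearch()._employees, "Acme")
    q.enqueue(LinkedinTitleDir()._search, "Acme")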