def profile_timeline(profile_id):
    conn = rethink_conn.conn()
    t = r.table("triggers").filter({"profile": profile_id})
    t = t.without([
        "company_domain_research_completed", "employee_search_completed",
        "emailhunter_search_completed"
    ])
    t = pd.DataFrame(list(t.run(conn)))
    t.index = [arrow.get(i).datetime for i in t.timestamp.fillna(0)]
    e = pd.DataFrame(list(r.table("company_employees").run(conn)))
    # bucket the triggers into one group per day
    tstamps = map(str, t.index.to_period("D").unique())
    dtimes = [
        arrow.get(i).timestamp
        for i in t.index.to_period("D").to_timestamp().unique()
    ]
    keys = [t[i].company_key.unique().tolist() for i in tstamps]
    cos = [t[t.company_key.isin(key)].fillna("").to_dict("r") for key in keys]
    emps = [e[e.company_id.isin(key)].fillna("").to_dict("r") for key in keys]
    final = [{
        "timestamp": dtimes[i],
        "cos": cos[i],
        "emps": emps[i]
    } for i in range(len(keys))]
    return make_response(json.dumps(final))
def _update_record(self, domain, _id):
    print "EMAIL HUNTER UPDATE RECORD"
    # NOTE: hardcoded API key; this should live in configuration, not source
    url = "http://api.emailhunter.co/v1/search?domain={0}&api_key=0191b3bdcf20b25b778da18bca995911cec0f630"
    conn = rethink_conn.conn()
    url = url.format(domain)
    html = requests.get(url).text
    ep = json.loads(html)
    print ep.keys()
    if "pattern" in ep.keys():
        # keep only the pattern; drop the raw email list
        del ep["emails"]
        if ep["pattern"] == "none":
            ep["stache"] = None
            ep["pattern"] = None
        else:
            # convert the Email Hunter pattern into a mustache template
            data = {"first": "{{first}}", "last": "{{last}}"}
            data["f"] = "{{first_initial}}"
            data["l"] = "{{last_initial}}"
            ep["stache"] = ep["pattern"].format(**data)
            ep["stache"] = ep["stache"] + "@{{domain}}"
    else:
        ep = None
    data = {"email_pattern": ep, "emailhunter_search_completed": r.now()}
    r.table('triggers').get(_id).update(data).run(conn)
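# A minimal sketch of the pattern-to-mustache conversion above. The Email
# Hunter response shape ({"pattern": "{first}.{l}", ...}) is an assumption
# for illustration, not confirmed by this code.
pattern = "{first}.{l}"  # hypothetical API value
data = {"first": "{{first}}", "last": "{{last}}",
        "f": "{{first_initial}}", "l": "{{last_initial}}"}
print pattern.format(**data) + "@{{domain}}"
# -> {{first}}.{{last_initial}}@{{domain}}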
def _update_company_record(self, domain, _id):
    start_time = time.time()
    print "UPDATE COMPANY RECORD"
    conn = rethink_conn.conn()
    print domain
    # The local-cache lookup is disabled, so Clearbit is always queried:
    # company = [i for i in r.table('companies').filter({"domain": domain}).run(conn)]
    company = []
    print "COMPANY FOUND"
    if not company:
        company = clearbit.Company.find(domain=domain, stream=True)
        company = company if company else {}
        print company
        r.table('companies').insert(company).run(conn)
        result = "found"
    else:
        # NOTE: these "found"/"not found" labels look inverted; kept as-is
        result = "not found"
    data = {
        "company_domain_research_completed": r.now(),
        "company_domain_research_result": result
    }
    r.table('triggers').get(_id).update(data).run(conn)
    bitmapist.mark_event("function:time:clearbit_search_company_record",
                         int((time.time() - start_time) * 10**6))
    rd.zadd("function:time:clearbit_search_company_record",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def _percentage_counts(self):
    conn = rethink_conn.conn()
    t = pd.DataFrame(list(r.table("triggers").run(conn)))
    e = pd.DataFrame(list(r.table("company_employees").run(conn)))
    have_employees = len(e.company_id.drop_duplicates())
    have_domain = len(t.domain.dropna())
    cdrc = "company_domain_research_completed"
    have_cb = len(t[cdrc].dropna())
    have_ep = len(t.email_pattern.dropna())
    data = {
        "_all": t.shape[0],
        "have_employees": have_employees,
        "have_domain": have_domain,
        "have_cb": have_cb,
        "have_ep": have_ep,
    }
    total = float(t.shape[0])
    _data = {
        "_all": t.shape[0],
        "have_employees": (have_employees / total) * 100,
        "have_domain": (have_domain / total) * 100,
        "have_cb": (have_cb / total) * 100,
        "have_ep": (have_ep / total) * 100,
    }
    # NOTE: only the values (not the keys) are pushed to the dashboard
    p.trigger('admin_dashboard', "total_counts", data.values())
    p.trigger('admin_dashboard', "percentage_counts", _data.values())
def signup():
    conn = rethink_conn.conn()
    json_body = request.get_json(force=True)  # avoid shadowing the json module
    email = json_body.get('username', '')
    # Email validation
    if not re.match(r"^[^@ ]+@[^@ ]+\.[^@ ]+$", email):
        return jsonify({'status': 'Not a valid email'})
    already_taken = len(
        list(r.table("users").filter(r.row["email"] == email).run(conn))) > 0
    if already_taken:
        return jsonify({'status': 'Already Taken'})
    password = b"%s" % json_body.get('password', '')
    # Create user
    token = _auth.create_user(email, password.encode('utf-8'))
    # Create Redis token and current plan
    # TODO
    #   - days left till end of trial
    #   - end of current plan
    #   - billing cycle
    return jsonify({'status': 'OK', 'token': token})
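# A minimal usage sketch for the signup view, assuming it is mounted at
# POST /signup on a local dev server (the route decorator is not shown in
# this snippet):
import requests
resp = requests.post("http://localhost:5000/signup",
                     json={"username": "user@example.com",
                           "password": "secret"})
print resp.json()  # {'status': 'OK', 'token': '...'} on success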
def _cron(self, role, locale, profile, country=None):
    ''' Get today's jobs for job postings. '''
    page = 0
    start_time = time.time()
    indeed_results = self._search(role, page, locale, country)
    companies = self._search_results_html_to_df(indeed_results)
    # keep only listings posted within the last day
    companies = companies[companies.date.str.contains("hour|minute|Just")]
    companies = self._date_phrase_to_timestamp(companies)
    companies = companies.drop_duplicates('company_name')
    companies["source"] = "Indeed"
    companies["keyword"] = role
    companies["profile"] = profile
    keys = [
        row.company_name.lower().replace(" ", "") + "_" + profile
        for i, row in companies.iterrows()
    ]
    companies["company_key"] = keys
    companies["createdAt"] = arrow.now().timestamp
    conn = rethink_conn.conn()
    r.table("triggers").insert(companies.to_dict('r')).run(conn)
    print "function:time:indeed_job_scrape", \
        str((time.time() - start_time) * 10**6), arrow.now().timestamp
    rd.zadd("function:time:indeed_job_scrape",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def _signal(self, qry, locale, profile, country=None):
    page = 1
    start_time = time.time()
    print "Simply Hired"
    html = self._html(qry, page, locale, country)
    listings = self._listings(html)
    if listings.empty:
        return "none found"
    # page forward until the oldest listing on the page is a day old
    while 'day' not in listings.date.tolist()[-1]:
        page = page + 1
        html = self._html(qry, page, locale, country)
        listings = listings.append(self._listings(html))
        print page
    # keep only listings newer than a day
    listings = listings[~listings.date.str.contains('day')]
    listings["keyword"] = qry
    listings = listings.drop_duplicates('company_name')
    listings['source'] = 'Simply Hired'
    listings["profile"] = profile
    companies = listings
    keys = [
        row.company_name.lower().replace(" ", "") + "_" + profile
        for i, row in companies.iterrows()
    ]
    companies["company_key"] = keys
    companies["createdAt"] = arrow.now().timestamp
    conn = rethink_conn.conn()
    r.table("triggers").insert(companies.to_dict('r')).run(conn)
    bitmapist.mark_event("function:time:simplyhired_job_scrape",
                         int((time.time() - start_time) * 10**6))
    rd.zadd("function:time:simplyhired_job_scrape",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def _signal(self, qry, locale, profile, country=None):
    start_time = time.time()
    html = self._html(qry, 1, locale, country)
    listings = self._listings(html)
    # read the last page number from the pagination widget, default to 1
    last_page = html.find('ul', {'class': 'paginationNumbers'})
    last_page = last_page.find_all('li') if last_page else None
    last_page = int(last_page[-1].text.strip()) if last_page else 1
    conn = rethink_conn.conn()
    for page in range(1, last_page + 1):  # pages appear to be 1-indexed
        html = self._html(qry, page, locale, country)
        listings = self._listings(html)
        listings['source'] = 'Zip Recruiter'
        listings["keyword"] = qry
        listings["profile"] = profile
        companies = listings
        keys = [
            row.company_name.lower().replace(" ", "") + "_" + profile
            for i, row in companies.iterrows()
        ]
        companies["company_key"] = keys
        companies["createdAt"] = arrow.now().timestamp
        r.table("triggers").insert(companies.to_dict('r')).run(conn)
    bitmapist.mark_event("function:time:ziprecruiter_job_scrape",
                         int((time.time() - start_time) * 10**6))
    rd.zadd("function:time:ziprecruiter_job_scrape",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def profile_counts(profile_id):
    conn = rethink_conn.conn()
    t = pd.DataFrame(
        list(r.table("triggers").filter({"profile": profile_id}).run(conn)))
    e = pd.DataFrame(list(r.table("company_employees").run(conn)))
    # employees whose company appears in this profile's triggers
    ee = len(e[e.company_id.isin(t.company_key.unique())])
    data = {"count": t.shape[0], "employee_count": ee}
    return make_response(json.dumps(data))
def company_info(domain):
    conn = rethink_conn.conn()
    data = r.table("companies").filter({"domain": domain}).run(conn)
    try:
        data = list(data)[0]
    except IndexError:
        data = {}
    return make_response(json.dumps(data))
def company_employees(_id):
    # TODO - create company employee keys
    print _id  # current_identity
    conn = rethink_conn.conn()
    qry = {"company_id": _id}
    data = r.table("company_employees").filter(qry).coerce_to("array").run(conn)
    return make_response(json.dumps(data))
def company_research():
    conn = rethink_conn.conn()
    triggers = r.table("triggers").coerce_to("array").run(conn)
    for val in triggers:
        if "domain" not in val.keys():
            continue
        print val["domain"]
        dq.enqueue(ClearbitSearch()._update_company_record, val["domain"],
                   val["company_key"])
    return make_response(json.dumps({"started": True}))
def profile_companies(profile_id):
    conn = rethink_conn.conn()
    data = r.table("triggers").filter(
        lambda trigger: trigger.has_fields("domain"))
    data = data.filter({"profile": profile_id})
    data = data.order_by("timestamp")
    data = data.without([
        "company_domain_research_completed", "employee_search_completed",
        "emailhunter_search_completed"
    ])
    data = data.eq_join("profile", r.table("prospect_profiles")).zip()
    data = data.run(conn)
    return make_response(json.dumps(data))
def _update_employee_record(self, company_name, _id, keyword=None,
                            profile_id=None):
    conn = rethink_conn.conn()
    _profile = r.table("prospect_profiles").get(profile_id).run(conn)
    # default to a single untitled search if the profile has no titles
    if "titles" not in _profile.keys():
        _profile["titles"] = [None]
    if len(_profile["titles"]) == 0:
        _profile["titles"] = [None]
    for title in _profile["titles"]:
        args = [company_name, _id, title, profile_id]
        q.enqueue(GoogleEmployeeSearch()._get_employee_record, *args)
def _cron(self):
    conn = rethink_conn.conn()
    self._profile_value_counts()
    #self._rq_job_counts()
    self._average_value_counts()
    self._percentage_counts()
def identity(self, payload):
    conn = rethink_conn.conn()
    print payload
    cursor = r.table("users").filter(
        r.row["id"] == payload['identity']).run(conn)
    try:
        user = cursor.next()
        print user
        return User(user['id'], user['email'], user["password"])
    except r.ReqlCursorEmpty:
        return None
def trigger_research():
    conn = rethink_conn.conn()
    triggers = r.table("triggers").coerce_to("array").run(conn)
    for val in triggers:
        if "domain" not in val.keys():
            continue
        print val["domain"]
        dq.enqueue(CompanyNameToDomain()._update_company_record,
                   val["company_name"], val["company_key"])
        dq.enqueue(GoogleEmployeeSearch()._update_employee_record,
                   val["company_name"], "", val["company_key"])
    return make_response(json.dumps({"started": True}))
def profile_triggers(profile_id, page=0):
    page = int(page)
    conn = rethink_conn.conn()
    data = r.table("triggers").filter(
        lambda trigger: trigger.has_fields("domain"))
    data = data.filter({"profile": profile_id})
    data = data.order_by(r.desc("timestamp"))
    data = data.without([
        "company_domain_research_completed", "employee_search_completed",
        "emailhunter_search_completed"
    ])
    data = data.eq_join("profile", r.table("prospect_profiles")).zip()
    # paginate 50 records at a time
    data = data.slice(page * 50, (page + 1) * 50).limit(50)
    data = list(data.run(conn))
    return make_response(json.dumps(data))
def _cron(self):
    conn = rethink_conn.conn()
    profiles = list(r.table("prospect_profiles").run(conn))
    # dispatch each profile to the collector matching its class
    for profile in profiles:
        _profile = [i["className"] for i in profile["profiles"]]
        if 'HiringProfile' in _profile:
            dq.enqueue(Signals()._hiring, profile, timeout=6000)
        elif 'PressProfile' in _profile:
            dq.enqueue(Press()._daily_collect, profile, timeout=6000)
        elif 'IndustryPressProfile' in _profile:
            dq.enqueue(Press()._daily_industry_collect, profile, timeout=6000)
        elif 'TwitterProfile' in _profile:
            dq.enqueue(Twitter()._daily_collect, profile["objectId"],
                       timeout=6000)
def triggers(page=0):
    print page
    page = int(page)
    conn = rethink_conn.conn()
    data = r.table("triggers").filter(
        lambda trigger: trigger.has_fields("domain"))
    data = data.order_by(r.desc("timestamp"))
    data = data.without([
        "company_domain_research_completed", "employee_search_completed",
        "emailhunter_search_completed"
    ])
    data = data.eq_join("profile", r.table("prospect_profiles"))
    # paginate 50 records at a time
    data = data.slice(page * 50, (page + 1) * 50).limit(50).zip()
    data = list(data.run(conn))
    return make_response(json.dumps(data))
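# A minimal pagination sketch, assuming the view is exposed at
# /triggers/<page> (the actual route is not shown in this snippet). Each
# page is a window of 50 records ordered by descending timestamp:
import requests
first = requests.get("http://localhost:5000/triggers/0").json()   # rows 0-49
second = requests.get("http://localhost:5000/triggers/1").json()  # rows 50-99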
def authenticate(self, username, password):
    conn = rethink_conn.conn()
    username, password = username.encode('utf-8'), password.encode('utf-8')
    cursor = r.table("users").filter(r.row["email"] == username).run(conn)
    try:
        user = cursor.next()
        if not user:
            return None
        email = user['email']
        hashed_pass = user['password']
        # hashpw() reproduces the stored hash when the password is correct;
        # password was already utf-8 encoded above
        if username == email and hashed_pass == bcrypt.hashpw(
                password, hashed_pass.encode('utf-8')):
            return User(user['id'], email, hashed_pass)
    except r.ReqlCursorEmpty:
        return None
def create_user(self, username, password):
    conn = rethink_conn.conn()
    # Hash the password with a freshly generated salt
    hashed_pass = bcrypt.hashpw(password, bcrypt.gensalt(8))
    user = {'email': username, 'password': hashed_pass}
    created = r.table("users").insert(user).run(conn)
    assert created['inserted'] == 1
    # Generate a JWT for the new user
    user_id = created['generated_keys'][0]
    user = User(user_id, username, hashed_pass)
    return self.jwt.jwt_encode_callback(user)
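# A minimal round-trip sketch of the bcrypt calls used above: hashpw with a
# fresh salt stores the salt inside the hash, and verification re-hashes the
# candidate password against the stored hash (the check authenticate() does).
import bcrypt
hashed = bcrypt.hashpw(b"secret", bcrypt.gensalt(8))
assert bcrypt.hashpw(b"secret", hashed) == hashed  # correct password
assert bcrypt.hashpw(b"wrong", hashed) != hashed   # wrong password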
def company_event_changes():
    conn = yield rethink_conn.conn()
    feed = yield r.table('company_events').changes().run(conn)
    while (yield feed.fetch_next()):
        change = yield feed.next()
        # find users watching this company's domain
        qry = {"domain": change["new_val"]["domain"]}
        users = yield r.table("user_contacts").filter(qry).coerce_to(
            "array").run(conn)
        max_number_of_elements = 100
        val = change["new_val"]
        for user in users:
            # push the event onto the user's feed and cap it at 100 entries
            key = "user:{0}".format(user["id"])
            redis.zadd(key, val["timestamp"], val["id"])
            redis.zremrangebyrank(key, max_number_of_elements, -1)
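# A minimal read-side sketch for the capped per-user feed written above,
# assuming the same redis client and key scheme ("user:1234" is a
# hypothetical key); zremrangebyrank keeps at most 100 members, and this
# reads the 20 highest-scored entries:
latest = redis.zrevrange("user:1234", 0, 19, withscores=True)
for member, score in latest:
    print member, score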
def _update_company_record(self, company_name, _id):
    print "UPDATE RECORD FOR COMPANY NAME"
    start_time = time.time()
    # take the top-scored domain for this company name
    domain = self.get(company_name)[0]["domain"]
    conn = rethink_conn.conn()
    print _id, domain
    print r.table('triggers').filter({
        "company_key": _id
    }).update({
        "domain": domain
    }, return_changes=True).run(conn)
    rd.zadd("function:time:company_name_to_domain",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def profile_employees(profile_id):
    conn = rethink_conn.conn()
    data = r.table("triggers").filter(
        lambda trigger: trigger.has_fields("domain"))
    data = data.filter({"profile": profile_id})
    data = data.order_by("createdAt")
    data = data.without([
        "company_domain_research_completed", "employee_search_completed",
        "emailhunter_search_completed"
    ])
    data = data.eq_join("profile", r.table("prospect_profiles")).zip()
    d = list(data.run(conn))
    e = pd.DataFrame(list(r.table("company_employees").run(conn)))
    ee = e[e.company_id.isin(pd.DataFrame(d).company_key.tolist())]
    # rename the merge key column (assumes company_id is the first column)
    emp = ee.columns.tolist()
    emp[0] = "company_key"
    ee.columns = emp
    em = pd.merge(ee, pd.DataFrame(d), on="company_key")
    return make_response(json.dumps(em.to_dict("r")))
def _get_employee_record(self, company_name, _id, keyword=None,
                         profile_id=None):
    start_time = time.time()
    conn = rethink_conn.conn()
    res = self._employees(company_name, keyword)
    res["company_id"] = _id
    res["profile_id"] = profile_id
    print "EMPLOYEES FOUND", company_name, res.shape
    r.table('company_employees').insert(res.to_dict("r")).run(conn)
    # mark the trigger as searched
    epsc = "employee_search_completed"
    r.table("triggers").get(_id).update({epsc: r.now()}).run(conn)
    bitmapist.mark_event("function:time:company_employee_search",
                         int((time.time() - start_time) * 10**6))
    rd.zadd("function:time:company_employee_search",
            str((time.time() - start_time) * 10**6),
            arrow.now().timestamp)
def _request(self, url, key, value, press_event_id):
    domain = "{}.{}".format(
        tldextract.extract(url).domain, tldextract.extract(url).tld)
    feed = pd.DataFrame(feedparser.parse(url)["entries"])
    feed[key] = value
    feed["press_event_id"] = press_event_id
    feed["source"] = domain.split(".")[0].lower()
    data = feed.applymap(lambda x: self._remove_non_ascii(x))
    data["rss_url"] = url
    if "published_parsed" in data.columns:
        # convert feedparser's struct_time into a unix timestamp
        ar = [
            arrow.get(datetime.fromtimestamp(mktime(i))).timestamp
            for i in data.published_parsed
        ]
        data["timestamp"] = ar
        del data["published_parsed"]
    data = [row.dropna().to_dict() for i, row in data.iterrows()]
    conn = rethink_conn.conn()
    r.table("press_events").insert(data).run(conn)
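# A minimal sketch of the struct_time -> unix timestamp conversion used in
# _request above; feedparser exposes published_parsed as a time.struct_time:
import time
import arrow
from time import mktime
from datetime import datetime
st = time.localtime()  # stands in for entry.published_parsed
print arrow.get(datetime.fromtimestamp(mktime(st))).timestamp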
def company_name_to_domain_changes():
    conn = yield rethink_conn.conn()
    feed = yield r.table('company_domain_research').changes().run(conn)
    while (yield feed.fetch_next()):
        change = yield feed.next()
        qry = change["new_val"]["qry"]
        # re-score the domain candidates from every engine's results so far
        q = r.table('company_domain_research').filter({"qry": qry})
        searches = yield q.coerce_to("array").run(conn)
        print pd.DataFrame(searches).search_engine, qry
        domains = CompanyNameToDomain().score(
            qry, [pd.DataFrame(i["res"]) for i in searches])
        if domains:
            # write the top-scored domain back onto the matching triggers
            triggers = r.table("triggers").filter({"company_name": qry})
            triggers = yield triggers.coerce_to("array").run(conn)
            print "triggers", len(triggers)
            for t in triggers:
                domain = domains[0]["domain"]
                upd = r.table("triggers").get(t["company_key"]).update(
                    {"domain": domain})
                yield upd.run(conn)
            print domains
def _start(self):
    conn = rethink_conn.conn()
    # PRWeb
    rr = requests.get("http://www.prweb.com/rss.htm")
    prweb_vals, prweb_sub = {}, {}
    for i in BeautifulSoup(rr.text).find("table").find_all("tr"):
        if "Business: " in i.find("td").text:
            prweb_sub[i.find("td").text.split("Business: ")[-1]] = \
                "http://www.prweb.com" + i.find_all("td")[1].find("a")["href"]
        if "Industry: " in i.find("td").text:
            prweb_vals[i.find("td").text.split("Industry: ")[-1]] = \
                "http://www.prweb.com" + i.find_all("td")[1].find("a")["href"]
    # Marketwired
    mw_vals, mw_sub = {}, {}
    rr = requests.get("http://www.marketwired.com/News_Room/rss_newsfeeds")
    for row in BeautifulSoup(rr.text).find_all("tr", {"class": "ByIndustry"}):
        mw_vals[row.find("a").text] = row.find_all("a")[1]["href"]
    for row in BeautifulSoup(rr.text).find_all("tr", {"class": "BySubject"}):
        name = row.find("td").text.strip()
        url = row.find_all("a")[1]["href"].split("rss?url=")[-1]
        url = urllib.unquote_plus(url)
        mw_sub[name.split("\t")[-1]] = url
    # PR Newswire (fetched through the Crawlera proxy)
    rr = Crawlera().get("http://www.prnewswire.com/rss/")
    pnw_vals, pnw_sub = {}, {}
    found = False
    for row in BeautifulSoup(rr.text).find_all("tr"):
        if row.find("th"):
            found = "Industry" in row.find("th").text
        if found and row.find("a"):
            for i in row.find_all("button"):
                if i.find("i")["class"] == ["icon-rss"]:
                    link = "http://prnewswire.com" + i["onclick"].split("'")[1]
                    pnw_vals[row.find("a").text] = link.strip()
    # BusinessWire
    rr = requests.get(
        "http://www.businesswire.com/portal/site/home/news/industries/")
    bw_vals, bw_sub = {}, {}
    for tr in BeautifulSoup(rr.text).find(
            "table", {"id": "newsbyIndustry"}).find_all("tr"):
        link = tr.find("td", {"class": "rss"}).find("a")["href"]
        bw_vals[tr.find("a").text] = link
    rr = requests.get(
        "http://www.businesswire.com/portal/site/home/news/subjects/")
    for tr in BeautifulSoup(rr.text).find(
            "table", {"id": "newsbySubject"}).find_all("tr"):
        link = tr.find("td", {"class": "rss"}).find("a")["href"]
        bw_sub[tr.find("a").text] = link
    # CNW (disabled)
    cnw_vals, cnw_sub = {}, {}
    """
    r = requests.get("http://www.newswire.ca/en/rss")
    for row in BeautifulSoup(r.text).find("table").find_all("table")[-1].find_all("tr"):
        for a in row.find_all("a"):
            if "RSS" in a.text:
                cnw_vals[row.find_all("td")[1].text] = a["href"]
    r = requests.get("http://www.newswire.ca/en/rss")
    for row in BeautifulSoup(r.text).find("table").find_all("table")[-3].find_all("tr"):
        for a in row.find_all("a"):
            if "RSS" in a.text:
                cnw_sub[row.find_all("td")[1].text] = a["href"]
    """
    # Merge the per-source feeds by industry and by subject
    ind = dict(bw_vals.items() + mw_vals.items() + pnw_vals.items() +
               prweb_vals.items() + cnw_vals.items())
    sub = dict(bw_sub.items() + mw_sub.items() + pnw_sub.items() +
               prweb_sub.items() + cnw_sub.items())
    print len(ind.keys()), len(sub.keys())
    # NOTE: the scraped URL dicts are replaced here by the persisted
    # name -> id mappings; only the key sets are used below
    ind = pd.DataFrame(list(r.table("press_industries").run(conn)))
    ind = dict(zip(ind.industry, ind.id))
    sub = pd.DataFrame(list(r.table("press_subjects").run(conn)))
    sub = dict(zip(sub.subject, sub.id))
    for i in ind.keys():
        pind = PressClassification()._industries()
        if i not in pind.keys():
            continue
        print pind[i], ind[i]
        q.enqueue(PressScrape()._request, ind[i], "industry", pind[i],
                  ind[pind[i]])
    for i in sub.keys():
        psub = PressClassification()._subjects()
        if i not in psub.keys():
            continue
        print psub[i], sub[i]
        q.enqueue(PressScrape()._request, sub[i], "subject", psub[i],
                  sub[psub[i]])
def get(self, company_name):
    # query DuckDuckGo, Google, Yandex, and Bing for the company name
    dd, g, yd, bg = self._search_engine_search(company_name)
    print "SEARCH ENGINE RESULTS", company_name
    print "===================="
    print g.shape, company_name
    print bg.shape, company_name
    print dd.shape, company_name
    print yd.shape, company_name
    if g.empty: g = pd.DataFrame(columns=["link", "domain"])
    if yd.empty: yd = pd.DataFrame(columns=["link", "domain"])
    if bg.empty: bg = pd.DataFrame(columns=["link", "domain"])
    if dd.empty: dd = pd.DataFrame(columns=["link", "domain"])
    # reduce each result link to its registered domain
    g["domain"] = [".".join(urlparse.urlparse(i).netloc.split(".")[-2:])
                   for i in g.link]
    yd["domain"] = [".".join(urlparse.urlparse(i).netloc.split(".")[-2:])
                    if i else "" for i in yd.link]
    bg["domain"] = [".".join(urlparse.urlparse(i).netloc.split(".")[-2:])
                    for i in bg.link]
    dd["domain"] = [".".join(urlparse.urlparse(i).netloc.split(".")[-2:])
                    if i else "" for i in dd.link]
    # top ten unique domains from each engine
    m = pd.concat([g.ix[:10].drop_duplicates("domain"),
                   yd.ix[:10].drop_duplicates("domain"),
                   dd.ix[:10].drop_duplicates("domain"),
                   bg.ix[:10].drop_duplicates("domain")])
    m = m.reset_index()
    m["domain"] = [".".join(urlparse.urlparse(i).netloc.split(".")[-2:])
                   for i in m.link]
    m = m[m.domain != ""]
    # score domains by frequency and by rank across all search engines
    a = m.domain.value_counts().ix[:10]
    b = m.groupby("domain").sum().sort("index").ix[:10]
    a, b = a.iloc[::-1], b.iloc[::-1]
    a, b = a.reset_index(), b.reset_index()
    a, b = a.reset_index(), b.reset_index()
    a.columns = ["new_score", "domain", "old_score"]
    b.columns = ["new_score", "domain", "old_score"]
    f = pd.concat([a, b]).groupby("domain").sum()
    f = f.sort("new_score", ascending=False)
    f["confidence"] = f.new_score / 8.0 * 100
    full_domains = f.reset_index()
    # break ties on the top three by fuzzy similarity to the company name
    domains = f[:3].reset_index()
    domains["fuzz_score"] = [fuzz.ratio(company_name, i)
                             for i in domains.domain]
    domains = domains.sort("fuzz_score", ascending=False)
    domains = domains.to_dict("r")
    data = {
        "company_name": company_name,
        "short_list": domains,
        "long_list": full_domains.to_dict("r"),
        "google_results": g.shape,
        "bing_results": bg.shape,
        "yandex_results": yd.shape,
        "duckduckgo_results": dd.shape,
        "createdAt": arrow.now().timestamp,
        "selected": domains[0]["domain"]
    }
    conn = rethink_conn.conn()
    r.table("company_name_to_domain").insert(data).run(conn)
    try:
        print domains[0]["domain"]
        return domains
    except IndexError:
        return None
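# A minimal usage sketch for the scorer above, assuming this method lives on
# CompanyNameToDomain (as the callers elsewhere in this section suggest):
domains = CompanyNameToDomain().get("Acme Corp")
if domains:
    print domains[0]["domain"], domains[0]["confidence"]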
import tornado.httpclient
import rethinkdb as r
import rethink_conn
from apscheduler.schedulers.tornado import TornadoScheduler
from tornado import ioloop, gen
from tornado.concurrent import Future, chain_future
import functools
from company_name_to_domain import CompanyNameToDomain
import pandas as pd

r.set_loop_type("tornado")
conn_future = rethink_conn.conn()
http_client = tornado.httpclient.AsyncHTTPClient()


class AsyncCompanyNameResearch:
    @tornado.gen.coroutine
    def handle_response(self, response):
        conn = yield conn_future
        if response.code != 200:
            # retry the request on failure
            http_client.fetch(response.effective_url,
                              AsyncCompanyNameResearch().handle_response)
            return
        # parse the search results and persist them to rethinkdb
        df = CompanyNameToDomain()._persist(response)
        yield r.table('company_domain_research').insert(df).run(conn)

    @tornado.gen.coroutine
    def start(self):
""" import rethinkdb as r import bcrypt from flask import current_app, request, jsonify, Blueprint from flask_jwt import JWT, jwt_required, current_identity from flask import Flask, send_from_directory, render_template, make_response, request from datetime import datetime, timedelta import re import rethink_conn import json from werkzeug.local import LocalProxy #conn = r.connect(host="localhost", port=28015, db="triggeriq") conn = rethink_conn.conn() auth = Blueprint('auth', __name__) _auth = LocalProxy(lambda: current_app.extensions['auth']) #TODO # - Authentication ''' Register: Login: POST to /signup POST to /login Content-Type: application/json { "username": user, "password": pass }