def _parse_response(self, html, company_name, keyword=None): results = Google()._results_html_to_df(html) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [ fuzz.partial_ratio(_name, company) for company in results.company_name ] else: results['company_score'] = [ fuzz.ratio(_name, company) for company in results.company_name ] if keyword: results['score'] = [ fuzz.partial_ratio(keyword, title) for title in results.title ] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def _employees(self, company_name="", keyword=None): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [ fuzz.partial_ratio(_name, company) for company in results.company_name ] else: results['company_score'] = [ fuzz.ratio(_name, company) for company in results.company_name ] if keyword: results['score'] = [ fuzz.partial_ratio(keyword, title) for title in results.title ] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def _employees(self, company_name="", keyword=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) results = Google().search(qry, 10) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company] if keyword != "": results['score'] = [fuzz.ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) return results
def _employees(self, company_name="", keyword=None): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company_name] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company_name] if keyword: results['score'] = [fuzz.partial_ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def _employees(self, company_name="", keyword="", api_key=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword, api_key return results if " " in company_name: results['company_score'] = [ fuzz.partial_ratio(_name, company) for company in results.company_name ] else: results['company_score'] = [ fuzz.ratio(_name, company) for company in results.company_name ] if keyword != "": results['score'] = [ fuzz.ratio(keyword, title) for title in results.title ] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() results["company_id"] = api_key """" data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) """ return results
def _parse_response(self, html, company_name, keyword=None): results = Google()._results_html_to_df(html) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company_name] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company_name] if keyword: results['score'] = [fuzz.partial_ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results