def _parse_response(self, html, company_name, keyword=None):
    results = Google()._results_html_to_df(html)
    results = results.dropna()
    results = Google()._google_df_to_linkedin_df(results)
    _name = '(?i){0}'.format(company_name)
    print results.columns
    if results.empty:
        print "No employees found for", company_name, keyword
        return results
    if " " in company_name:
        results['company_score'] = [fuzz.partial_ratio(_name, company)
                                    for company in results.company_name]
    else:
        results['company_score'] = [fuzz.ratio(_name, company)
                                    for company in results.company_name]
    if keyword:
        results['score'] = [fuzz.partial_ratio(keyword, title)
                            for title in results.title]
        results = results[results.score > 75]
    results = results[results.company_score > 49]
    results = results.drop_duplicates()
    return results
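# Illustrative note (not part of the scraper): fuzz is assumed to come from the
# fuzzywuzzy package, and the sample strings below are made up.
#   from fuzzywuzzy import fuzz
#   fuzz.partial_ratio("acme corp", "Jane Doe - Engineer at Acme Corp | LinkedIn")  # high: best-matching substring
#   fuzz.ratio("acme corp", "Acme")                                                 # lower: whole-string comparison
# So the company_score cutoff of 49 keeps fairly loose company matches, while the
# title-score cutoff of 75 requires the keyword to match the title closely.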
def _employees(self, company_name="", keyword=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) results = Google().search(qry, 10) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company] if keyword != "": results['score'] = [fuzz.ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) return results
def _employees(self, company_name="", keyword=None): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [ fuzz.partial_ratio(_name, company) for company in results.company_name ] else: results['company_score'] = [ fuzz.ratio(_name, company) for company in results.company_name ] if keyword: results['score'] = [ fuzz.partial_ratio(keyword, title) for title in results.title ] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def _related(self, domain, api_key="", name=""):
    companies = Google().search("related:{0}".format(domain), 10)
    companies = companies.drop_duplicates()
    companies.columns = ['link', 'description', 'title', 'lol', 'lmao']
    data = {'data': companies.to_dict('r'),
            "domain": domain,
            "company_name": name}
    data["api_key"] = api_key
    CompanyExtraInfoCrawl()._persist(data, "similar", api_key)
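# Note: "related:<domain>" is a Google search operator that returns sites Google
# considers similar to the given domain, so this persists up to ten of those
# results as "similar" companies for the domain.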
def _employees(self, company_name="", keyword=None): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company_name] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company_name] if keyword: results['score'] = [fuzz.partial_ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def _employees(self, company_name="", keyword="", api_key=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword, api_key return results if " " in company_name: results['company_score'] = [ fuzz.partial_ratio(_name, company) for company in results.company_name ] else: results['company_score'] = [ fuzz.ratio(_name, company) for company in results.company_name ] if keyword != "": results['score'] = [ fuzz.ratio(keyword, title) for title in results.title ] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() results["company_id"] = api_key """" data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) """ return results
def _press_releases(self, qry, company_domain=None, period=None):
    # period is currently unused
    queries = ['"{0}" site:prnewswire.com'.format(qry),
               '"{0}" site:businesswire.com'.format(qry),
               '"{0}" site:marketwired.com'.format(qry),
               '"{0}" site:newswire.ca'.format(qry),
               '"{0}" site:reuters.com'.format(qry)]
    p = Google()._multi_get(queries)
    try:
        p = p.drop_duplicates()
    except:
        pass
    #p['date'] = [span.split('Business Wire')[-1].split('...')[0].strip() for span in p.link_span]
    p['description'] = ["".join(span.split('...')[1:]).strip() for span in p.link_span]
    p["domain"] = company_domain
    p['date'] = [span.split('...')[0].strip() for span in p.link_span]
    p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
    p['title'] = p['link_text']
    p = p.drop('link_text', 1)
    p = p.drop('url', 1)
    p = p.drop('link_span', 1)
    #for i in p.timestamp: print i
    data = p
    data["domain"] = company_domain
    data["event_type"] = "CompanyPressEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    data["event_key"] = ["".join(map(str, _data.values()))[:124]
                         for _data in data.to_dict("r")]
    data = [row.dropna().to_dict() for i, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
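# Illustrative note: Google result snippets for news links typically read like
# "Jan 5, 2016 ... Acme Corp today announced ...", so splitting link_span on
# '...' yields the date (before the ellipsis) and the description (after), e.g.:
#   span = "Jan 5, 2016 ... Acme Corp today announced a new widget"
#   span.split('...')[0].strip()             # "Jan 5, 2016"
#   "".join(span.split('...')[1:]).strip()   # "Acme Corp today announced a new widget"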