def _domain_search(self, domain, api_key="", name=""): qry = 'site:zoominfo.com/c/ {0}'.format(domain) df = Google().search(qry) if df.empty: data = {'company_name': name, "domain": domain} return CompanyInfoCrawl()._persist(data, "zoominfo", api_key) df['_name'] = [ i.split("Company Profile")[0].strip() for i in df.link_text ] df["score"] = [fuzz.ratio(b, name) for b in df._name] df = df[df.score > 70] df = df.sort('score', ascending=False) if df.empty: data = {'company_name': name, "domain": domain} return CompanyInfoCrawl()._persist(data, "zoominfo", api_key) df = df.reset_index().drop('index', 1) url = df.ix[0].link print "ZOOMINFO URL", url html = Google().cache(url) html = requests.get(url).text html = self._remove_non_ascii(html) zoominfo = self._cache_html_to_df(html) zoominfo['company_name'] = name zoominfo['handle'] = url zoominfo["domain_search"] = True zoominfo["domain"] = domain print zoominfo CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
def _url_to_dict(self, name, url, api_key=""):
    # Fetch a cached LinkedIn company page and persist it as a dict.
    html = Google().cache(url)
    info = self._company_cache_html_to_df(html)
    if isinstance(info, str):
        # Parsing failed; persist the bare company name instead.
        data = {'company_name': name}
        return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
    info = json.loads(info.iloc[0].to_json())
    info['company_name'] = name
    info['handle'] = url
    info["domain_search"] = True
    CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
    return info

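# The json round-trip above is one way to turn a single DataFrame row into a
# plain dict; a sketch of the direct pandas equivalent:
#
#     row_dict = info.iloc[0].to_dict()
#
# The to_json()/json.loads() round-trip additionally coerces numpy scalars
# into native Python types, which may be why it was chosen here.
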
def _signal(self, link, api_key=""):
    html = Google().cache(link)
    info = self._html_to_dict(html)
    tweets = self._tweets(html)
    CompanyInfoCrawl()._persist(info, "twitter", api_key)
    for tweet in tweets:
        CompanyExtraInfoCrawl()._persist(tweet, "tweets", api_key)

def _signal(self, link, api_key=""):
    html = Google().cache(link)
    info = self._html_to_dict(html)
    posts = self._posts(html)
    CompanyInfoCrawl()._persist(info, "facebook", api_key)
    for post in posts:
        CompanyExtraInfoCrawl()._persist(post, "facebook_posts", api_key)

def _company_profile(self, company_name, api_key="", domain=""): qry = 'site:zoominfo.com/c/ {0}'.format(company_name) google_df = Google().search(qry) data = {'company_name': company_name, "domain": domain} if google_df.empty: return CompanyInfoCrawl()._persist(data, "zoominfo", api_key) url = google_df.ix[0].link print "ZOOMINFO URL", url html = Google().ec2_cache(url) html = requests.get(url).text html = self._remove_non_ascii(html) zoominfo = self._cache_html_to_df(html) zoominfo['company_name'] = company_name zoominfo['domain'] = domain zoominfo['handle'] = url print zoominfo CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
def _company_profile(self, company_name, api_key):
    # Find the LinkedIn company page via Google and persist its details.
    qry = company_name + ' site:linkedin.com/company'
    google_results = Google().search(qry)
    if google_results.empty:
        return CompanyInfoCrawl()._persist({'company_name': company_name},
                                           'linkedin', api_key)
    url = google_results.iloc[0].link
    html = Google().cache(url)
    info = self._company_cache_html_to_df(html)
    if isinstance(info, str):
        # Parsing failed; persist the bare company name instead.
        return CompanyInfoCrawl()._persist({'company_name': company_name},
                                           'linkedin', api_key)
    info = json.loads(info.iloc[0].to_json())
    info['company_name'] = company_name
    info['handle'] = url
    CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
    return info

def _company_profile(self, name, api_key=""):
    # Find the company's Twitter profile via Google and persist it.
    df = Google().search('site:twitter.com {0}'.format(name))
    if df.empty:
        return df
    url = df.link.tolist()[0]
    html = requests.get(url).text
    val = self._html_to_dict(html)
    val["company_name"] = name
    CompanyInfoCrawl()._persist(val, "twitter", api_key)

def _url_to_dict(self, company_name, url, api_key="", domain=""):
    # Parse a cached ZoomInfo page directly from its URL and persist it.
    html = Google().cache(url)
    zoominfo = self._cache_html_to_df(html)
    zoominfo['company_name'] = company_name
    zoominfo['domain'] = domain
    zoominfo['handle'] = url
    print(zoominfo)
    CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)

def _domain_search(self, domain, api_key="", name=""):
    # Find the LinkedIn company page for a domain; persist it only when the
    # parsed page reports the same domain.
    qry = 'site:linkedin.com/company {0}'.format(domain)
    google_results = Google().search(qry)
    if google_results.empty:
        data = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
    url = google_results.iloc[0].link
    html = Google().cache(url)
    info = self._company_cache_html_to_df(html)
    if isinstance(info, str):
        data = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
    info = json.loads(info.iloc[0].to_json())
    info['company_name'] = name
    info['handle'] = url
    info["domain_search"] = True
    if info.get("domain") == domain:
        CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
    return info

def _company_profile(self, name, api_key=""):
    # Find the company's Facebook page via Google and persist it.
    df = Google().search('site:facebook.com {0}'.format(name))
    if df.empty:
        return df
    url = df.link.tolist()[0]
    html = Google().cache(url)
    val = self._html_to_dict(html)
    print(val)
    val["company_name"] = name
    CompanyInfoCrawl()._persist(val, "facebook", api_key)

def _domain_search(self, domain, api_key="", name=""):
    # Scan Twitter search results and keep the first profile whose bio link
    # points at the requested domain.
    df = Google().search('site:twitter.com {0}'.format(domain))
    val = None
    for url in df.link:
        r = requests.get(url).text
        link = BeautifulSoup(r, "html.parser").find(
            'span', {'class': 'ProfileHeaderCard-urlText'})
        link = link.text.strip() if link else ""
        if domain not in link:
            continue
        val = self._html_to_dict(r)
        break
    if val is None:
        # No profile advertised the domain; nothing to persist.
        return
    val["company_name"] = name
    val["domain"] = domain
    CompanyInfoCrawl()._persist(val, "twitter", api_key)

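# A minimal sketch of the bio-link extraction above, runnable on its own.
# The markup mimics the old ProfileHeaderCard layout this scraper assumes;
# current twitter.com pages may no longer serve it:
#
#     from bs4 import BeautifulSoup
#
#     html = '<span class="ProfileHeaderCard-urlText"> acme.com </span>'
#     span = BeautifulSoup(html, "html.parser").find(
#         'span', {'class': 'ProfileHeaderCard-urlText'})
#     assert span.text.strip() == "acme.com"
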
def _domain_search(self, domain, api_key="", name=""):
    # Scan Facebook search results and keep the first cached page that
    # mentions the requested domain anywhere in its text.
    df = Google().search('site:facebook.com {0}'.format(domain))
    for url in df.link:
        html = Google().cache(url)
        if domain not in BeautifulSoup(html, "html.parser").text:
            continue
        val = self._html_to_dict(html)
        val["company_name"] = name
        val["domain"] = domain
        CompanyInfoCrawl()._persist(val, "facebook", api_key)
        break

def _url_to_dict(self, name, url, api_key=""):
    # Parse a cached Facebook page directly from its URL and persist it.
    html = Google().cache(url)
    val = self._html_to_dict(html)
    print(val)
    val["company_name"] = name
    CompanyInfoCrawl()._persist(val, "facebook", api_key)