Example 1
 def _domain_search(self, domain, api_key="", name=""):
     qry = 'site:zoominfo.com/c/ {0}'.format(domain)
     df = Google().search(qry)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df['_name'] = [
         i.split("Company Profile")[0].strip() for i in df.link_text
     ]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.sort_values('score', ascending=False)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df = df.reset_index(drop=True)
     url = df.iloc[0].link
     print("ZOOMINFO URL", url)
     html = requests.get(url).text
     html = self._remove_non_ascii(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = name
     zoominfo['handle'] = url
     zoominfo["domain_search"] = True
     zoominfo["domain"] = domain
     print(zoominfo)
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
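The fuzzy-name filter above is the core of the match logic and can be exercised on its own. A minimal sketch, assuming pandas and fuzzywuzzy (the source of `fuzz.ratio`); the sample row is invented, and the column names mirror the search DataFrame above:

 import pandas as pd
 from fuzzywuzzy import fuzz

 name = "Acme Corp"
 df = pd.DataFrame({
     "link": ["https://www.zoominfo.com/c/acme-corp/1"],
     "link_text": ["Acme Corp Company Profile | ZoomInfo"],
 })
 # score each result title against the target name, keep strong matches only
 df["_name"] = [t.split("Company Profile")[0].strip() for t in df.link_text]
 df["score"] = [fuzz.ratio(candidate, name) for candidate in df._name]
 df = df[df.score > 70].sort_values("score", ascending=False)
 print(df)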
Example 2
 def _url_to_dict(self, name, url, api_key=""):
     html = Google().cache(url)
     info = self._company_cache_html_to_df(html)
     if isinstance(info, str):
         data = {'company_name': name}
         return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
     info = json.loads(info.iloc[0].to_json())
     info['company_name'] = name
     info['handle'] = url
     info["domain_search"] = True
     CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
     return info
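The `json.loads(info.iloc[0].to_json())` round-trip above is how a single DataFrame row becomes a plain dict of JSON-safe values. A minimal sketch with invented column names:

 import json

 import pandas as pd

 info = pd.DataFrame([{"website": "acme.com", "employees": 250}])
 # serialising one row (a Series) through JSON yields a plain dict
 record = json.loads(info.iloc[0].to_json())
 print(record)  # {'website': 'acme.com', 'employees': 250}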
Example 3
 def _signal(self, link, api_key=""):
     html = Google().cache(link)
     info = self._html_to_dict(html)
     tweets = self._tweets(html)
     CompanyInfoCrawl()._persist(info, "twitter", api_key)
     for tweet in tweets:
         CompanyExtraInfoCrawl()._persist(tweet, "tweets", api_key)
Example 4
 def _signal(self, link, api_key=""):
     html = Google().cache(link)
     info = self._html_to_dict(html)
     posts = self._posts(html)
     CompanyInfoCrawl()._persist(info, "facebook", api_key)
     for post in posts:
         CompanyExtraInfoCrawl()._persist(post, "facebook_posts", api_key)
Example 5
 def _company_profile(self, company_name, api_key="", domain=""):
     qry = 'site:zoominfo.com/c/ {0}'.format(company_name)
     google_df = Google().search(qry)
     data = {'company_name': company_name, "domain": domain}
     if google_df.empty:
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     url = google_df.iloc[0].link
     print("ZOOMINFO URL", url)
     html = requests.get(url).text
     html = self._remove_non_ascii(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = company_name
     zoominfo['domain'] = domain
     zoominfo['handle'] = url
     print(zoominfo)
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
Example 6
 def _company_profile(self, company_name, api_key):
     qry = company_name + ' site:linkedin.com/company'
     google_results = Google().search(qry)
     if google_results.empty:
         return CompanyInfoCrawl()._persist({'company_name': company_name},
                                            'linkedin', api_key)
     url = google_results.iloc[0].url
     html = Google().cache(url)
     info = self._company_cache_html_to_df(html)
     if isinstance(info, str):
         return CompanyInfoCrawl()._persist({'company_name': company_name},
                                            'linkedin', api_key)
     info = json.loads(info.iloc[0].to_json())
     info['company_name'] = company_name
     info['handle'] = url
     CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
     return info
Example 7
 def _company_profile(self, name, api_key=""):
     df = Google().search('site:twitter.com {0}'.format(name))
     if df.empty:
         return df
     url = df.link.tolist()[0]
     html = requests.get(url).text
     val = self._html_to_dict(html)
     val["company_name"] = name
     CompanyInfoCrawl()._persist(val, "twitter", api_key)
Example 8
 def _url_to_dict(self, company_name, url, api_key="", domain=""):
     html = Google().cache(url)
     #val = self._html_to_dict(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = company_name
     zoominfo['domain'] = domain
     zoominfo['handle'] = url
     print(zoominfo)
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
Example 9
 def _domain_search(self, domain, api_key="", name=""):
     qry = 'site:linkedin.com/company {0}'.format(domain)
     google_results = Google().search(qry)
     if google_results.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
     url = google_results.iloc[0].url
     html = Google().cache(url)
     info = self._company_cache_html_to_df(html)
     if isinstance(info, str):
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, 'linkedin', api_key)
     info = json.loads(info.iloc[0].to_json())
     info['company_name'] = name
     info['handle'] = url
     info["domain_search"] = True
     if info["domain"] == domain:
         CompanyInfoCrawl()._persist(info, 'linkedin', api_key)
         return info
Example 10
 def _company_profile(self, name, api_key=""):
     df = Google().search('site:facebook.com {0}'.format(name))
     if df.empty:
         return df
     url = df.link.tolist()[0]
     html = Google().cache(url)
     #browser = Browser('phantomjs')
     #browser.visit(url)
     val = self._html_to_dict(html)
     print(val)
     val["company_name"] = name
     CompanyInfoCrawl()._persist(val, "facebook", api_key)
Example 11
 def _domain_search(self, domain, api_key="", name=""):
     df = Google().search('site:twitter.com {0}'.format(domain))
     val = {}
     for url in df.link:
         r = requests.get(url).text
         link = BeautifulSoup(r, "html.parser").find(
             'span', {'class': 'ProfileHeaderCard-urlText'})
         link = link.text.strip() if link else ""
         # keep the first profile whose bio link mentions the domain
         if domain not in link:
             continue
         val = self._html_to_dict(r)
         break
     val["company_name"] = name
     val["domain"] = domain
     CompanyInfoCrawl()._persist(val, "twitter", api_key)
Example 12
 def _domain_search(self, domain, api_key="", name=""):
     df = Google().search('site:facebook.com {0}'.format(domain))
     for url in df.link:
         #browser = Browser('phantomjs')
         #browser.visit(url)
         # html = browser.html
         html = Google().cache(url)
         # skip cached pages that never mention the target domain
         if domain not in BeautifulSoup(html, "html.parser").text:
             continue
         val = self._html_to_dict(html)
         val["company_name"] = name
         val["domain"] = domain
         CompanyInfoCrawl()._persist(val, "facebook", api_key)
         break
Example 13
 def _url_to_dict(self, name, url, api_key=""):
     html = Google().cache(url)
     val = self._html_to_dict(html)
     print(val)
     val["company_name"] = name
     CompanyInfoCrawl()._persist(val, "facebook", api_key)