def _domain_search(self, domain, api_key="", name=""):
    """Look up a company's ZoomInfo profile via Google and persist the outcome.

    Searches Google for ``site:zoominfo.com/c/ <domain>``, derives a
    candidate company name from each result's link text (the part before
    "Company Profile") and keeps the candidates whose ``fuzz.ratio``
    against ``name`` is above 70.  The page of the best-scoring candidate
    is downloaded, stripped of non-ASCII characters, parsed with
    ``_cache_html_to_df`` and persisted under the "zoominfo" source.

    Args:
        domain: Company domain used in the search query and stored with
            the record.
        api_key: Forwarded unchanged to ``CompanyInfoCrawl()._persist``.
        name: Company name the search results are fuzzy-matched against.

    Returns:
        Whatever ``_persist`` returns for the stub record when no result
        matches; ``None`` after a parsed profile has been persisted.
    """
    results = Google().search('site:zoominfo.com/c/ {0}'.format(domain))

    if not results.empty:
        results['_name'] = [
            text.split("Company Profile")[0].strip()
            for text in results.link_text
        ]
        results["score"] = [
            fuzz.ratio(candidate, name) for candidate in results._name
        ]
        results = results[results.score > 70].sort('score', ascending=False)

    # Covers both "Google found nothing" and "nothing scored above 70".
    if results.empty:
        stub = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(stub, "zoominfo", api_key)

    # Renumber after sorting so that label 0 is the highest-scoring row.
    results = results.reset_index().drop('index', axis=1)
    url = results.ix[0].link
    # Single-argument parenthesised form prints the same text as the
    # Python 2 statement ``print "ZOOMINFO URL", url``.
    print("ZOOMINFO URL %s" % url)

    # NOTE(review): the Google-cache response is requested but its result is
    # never used; only the direct download below is parsed. Confirm whether
    # this call is still needed before removing it.
    Google().cache(url)
    html = self._remove_non_ascii(requests.get(url).text)

    zoominfo = self._cache_html_to_df(html)
    extra_fields = (
        ('company_name', name),
        ('handle', url),
        ("domain_search", True),
        ("domain", domain),
    )
    for key, value in extra_fields:
        zoominfo[key] = value
    print(zoominfo)
    CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
def _domain_search(self, domain, api_key="", name=""):
    """Look up a company's ZoomInfo profile via Google and persist the outcome.

    Searches Google for ``site:zoominfo.com/c/ <domain>``, derives a
    candidate company name from each result's link text (the part before
    "Company Profile") and keeps the candidates whose ``fuzz.ratio``
    against ``name`` is above 70.  The page of the best-scoring candidate
    is downloaded, stripped of non-ASCII characters, parsed with
    ``_cache_html_to_df`` and persisted under the "zoominfo" source.

    Args:
        domain: Company domain used in the search query and stored with
            the record.
        api_key: Forwarded unchanged to ``CompanyInfoCrawl()._persist``.
        name: Company name the search results are fuzzy-matched against.

    Returns:
        Whatever ``_persist`` returns for the stub record when no result
        matches; ``None`` after a parsed profile has been persisted.
    """
    results = Google().search('site:zoominfo.com/c/ {0}'.format(domain))

    if not results.empty:
        results['_name'] = [
            text.split("Company Profile")[0].strip()
            for text in results.link_text
        ]
        results["score"] = [
            fuzz.ratio(candidate, name) for candidate in results._name
        ]
        results = results[results.score > 70].sort('score', ascending=False)

    # Covers both "Google found nothing" and "nothing scored above 70".
    if results.empty:
        stub = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(stub, "zoominfo", api_key)

    # Renumber after sorting so that label 0 is the highest-scoring row.
    results = results.reset_index().drop('index', axis=1)
    url = results.ix[0].link
    # Single-argument parenthesised form prints the same text as the
    # Python 2 statement ``print "ZOOMINFO URL", url``.
    print("ZOOMINFO URL %s" % url)

    # NOTE(review): the Google-cache response is requested but its result is
    # never used; only the direct download below is parsed. Confirm whether
    # this call is still needed before removing it.
    Google().cache(url)
    html = self._remove_non_ascii(requests.get(url).text)

    zoominfo = self._cache_html_to_df(html)
    extra_fields = (
        ('company_name', name),
        ('handle', url),
        ("domain_search", True),
        ("domain", domain),
    )
    for key, value in extra_fields:
        zoominfo[key] = value
    print(zoominfo)
    CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
def _company_blog(self, domain, api_key="", name=""):
    """Collect a company's blog posts from Google and persist them.

    Runs the Google query ``inurl:blog site:<domain>``, converts the date
    prefix of every result snippet (the text before "...") to a Unix
    timestamp, and persists the rows together with a guessed blog URL via
    ``CompanyExtraInfoCrawl()._persist`` under the "blog_data" source.

    Args:
        domain: Company domain to search and to store with the record.
        api_key: Stored in the record and forwarded to ``_persist``.
        name: Company name stored in the record.

    Returns:
        None. Nothing is persisted when the search has no results.
    """
    # TODO get blog url
    df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    print(df)
    if df.empty:
        return

    df["count"] = [len(link) for link in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # The shortest link is taken as the blog root. ``.iloc[0]`` selects the
    # first row *after* sorting; the previous ``.ix[0]`` was label-based on
    # the integer index and therefore always returned the original first
    # row, which made the sort a no-op.
    # NOTE(review): lengths are computed from ``link`` but the value is read
    # from ``url`` - confirm the search results carry both columns.
    url = df.sort('count').url.iloc[0]

    months = list(calendar.month_abbr)

    def _snippet_timestamp(stamp):
        """Return a Unix timestamp for one snippet date prefix, 0 if unknown."""
        parts = stamp.split(" ")
        # calendar.month_abbr[0] is '', so index 0 never denotes a month.
        month = months.index(parts[0]) if parts[0] in months else 0
        if month:
            # Absolute date such as "Jan 5, 2015" -> "1 5, 2015".
            numeric_date = str(month) + " " + " ".join(parts[1:])
            try:
                return arrow.get(numeric_date, "M D, YYYY").timestamp
            except Exception:
                # Best effort: a malformed date yields the 0 sentinel
                # instead of aborting the whole crawl.
                return 0
        if "day" in stamp:
            # Relative date such as "3 days ago". The original code read a
            # stale comprehension variable here and called int() on a list,
            # so this case could never produce a timestamp.
            try:
                days_ago = int(parts[0])
            except ValueError:
                return 0
            return arrow.utcnow().replace(days=-days_ago).timestamp
        return 0

    df["timestamp"] = [
        _snippet_timestamp(span.split("...")[0].strip())
        for span in df.link_span
    ]

    data = {
        'data': df.to_dict('r'),
        'blog_url': url,
        "domain": domain,
        "api_key": api_key,
        "company_name": name,
    }
    CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
def _company_blog(self, domain, api_key="", name=""):
    """Collect a company's blog posts from Google and persist them.

    Runs the Google query ``inurl:blog site:<domain>``, converts the date
    prefix of every result snippet (the text before "...") to a Unix
    timestamp, and persists the rows together with a guessed blog URL via
    ``CompanyExtraInfoCrawl()._persist`` under the "blog_data" source.

    Args:
        domain: Company domain to search and to store with the record.
        api_key: Stored in the record and forwarded to ``_persist``.
        name: Company name stored in the record.

    Returns:
        None. Nothing is persisted when the search has no results.
    """
    # TODO get blog url
    df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    print(df)
    if df.empty:
        return

    df["count"] = [len(link) for link in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # The shortest link is taken as the blog root. ``.iloc[0]`` selects the
    # first row *after* sorting; the previous ``.ix[0]`` was label-based on
    # the integer index and therefore always returned the original first
    # row, which made the sort a no-op.
    # NOTE(review): lengths are computed from ``link`` but the value is read
    # from ``url`` - confirm the search results carry both columns.
    url = df.sort('count').url.iloc[0]

    months = list(calendar.month_abbr)

    def _snippet_timestamp(stamp):
        """Return a Unix timestamp for one snippet date prefix, 0 if unknown."""
        parts = stamp.split(" ")
        # calendar.month_abbr[0] is '', so index 0 never denotes a month.
        month = months.index(parts[0]) if parts[0] in months else 0
        if month:
            # Absolute date such as "Jan 5, 2015" -> "1 5, 2015".
            numeric_date = str(month) + " " + " ".join(parts[1:])
            try:
                return arrow.get(numeric_date, "M D, YYYY").timestamp
            except Exception:
                # Best effort: a malformed date yields the 0 sentinel
                # instead of aborting the whole crawl.
                return 0
        if "day" in stamp:
            # Relative date such as "3 days ago". The original code read a
            # stale comprehension variable here and called int() on a list,
            # so this case could never produce a timestamp.
            try:
                days_ago = int(parts[0])
            except ValueError:
                return 0
            return arrow.utcnow().replace(days=-days_ago).timestamp
        return 0

    df["timestamp"] = [
        _snippet_timestamp(span.split("...")[0].strip())
        for span in df.link_span
    ]

    data = {
        'data': df.to_dict('r'),
        'blog_url': url,
        "domain": domain,
        "api_key": api_key,
        "company_name": name,
    }
    CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
def _company_blog(self, domain, period=None):
    """Record a company's blog posts found on Google as "CompanyBlogEvent" rows.

    Runs the Google query ``inurl:blog site:<domain>``, converts the date
    prefix of every result snippet (the text before "...") to a Unix
    timestamp, tags each row with the domain, an event type and an event
    key, and inserts the rows into the ``events`` table.

    Args:
        domain: Company domain to search and to store on every event.
        period: When truthy, the search is called with the extra argument
            "d" (presumably "past day" - confirm against ``Google.search``).
            The value of ``period`` itself is not forwarded.

    Returns:
        The list of inserted event dicts, or None when the search has no
        results.
    """
    # TODO get blog url
    query = 'inurl:blog site:{0}'.format(domain)
    if period:
        df = Google().search(query, 1, "d")
    else:
        df = Google().search(query, 1)
    if df.empty:
        return

    # "count" (link length) is kept because it is stored with each event.
    # The former ``url = df.sort('count').url.ix[0]`` lookup was dropped: its
    # result was never used in this method.
    df["count"] = [len(link) for link in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)

    months = list(calendar.month_abbr)

    def _snippet_timestamp(stamp):
        """Return a Unix timestamp for one snippet date prefix, 0 if unknown."""
        parts = stamp.split(" ")
        # calendar.month_abbr[0] is '', so index 0 never denotes a month.
        month = months.index(parts[0]) if parts[0] in months else 0
        if month:
            # Absolute date such as "Jan 5, 2015" -> "1 5, 2015".
            numeric_date = str(month) + " " + " ".join(parts[1:])
            try:
                return arrow.get(numeric_date, "M D, YYYY").timestamp
            except Exception:
                # Best effort: a malformed date yields the 0 sentinel
                # instead of aborting the whole crawl.
                return 0
        if "day" in stamp:
            # Relative date such as "3 days ago". The original code read a
            # stale comprehension variable here and called int() on a list,
            # so this case could never produce a timestamp.
            try:
                days_ago = int(parts[0])
            except ValueError:
                return 0
            return arrow.utcnow().replace(days=-days_ago).timestamp
        return 0

    df["timestamp"] = [
        _snippet_timestamp(span.split("...")[0].strip())
        for span in df.link_span
    ]

    data = df
    print(data)
    data["domain"] = domain
    data["event_type"] = "CompanyBlogEvent"
    data = data.applymap(self._remove_non_ascii)
    # The key is the concatenation of all cell values, capped at 124 chars.
    data["event_key"] = [
        "".join(map(str, record.values()))[:124]
        for record in data.to_dict("r")
    ]
    # Drop missing cells so they are not written out as null fields.
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
def _company_blog(self, domain, period=None):
    """Record a company's blog posts found on Google as "CompanyBlogEvent" rows.

    Runs the Google query ``inurl:blog site:<domain>``, converts the date
    prefix of every result snippet (the text before "...") to a Unix
    timestamp, tags each row with the domain, an event type and an event
    key, and inserts the rows into the ``events`` table.

    Args:
        domain: Company domain to search and to store on every event.
        period: When truthy, the search is called with the extra argument
            "d" (presumably "past day" - confirm against ``Google.search``).
            The value of ``period`` itself is not forwarded.

    Returns:
        The list of inserted event dicts, or None when the search has no
        results.
    """
    # TODO get blog url
    query = 'inurl:blog site:{0}'.format(domain)
    if period:
        df = Google().search(query, 1, "d")
    else:
        df = Google().search(query, 1)
    if df.empty:
        return

    # "count" (link length) is kept because it is stored with each event.
    # The former ``url = df.sort('count').url.ix[0]`` lookup was dropped: its
    # result was never used in this method.
    df["count"] = [len(link) for link in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)

    months = list(calendar.month_abbr)

    def _snippet_timestamp(stamp):
        """Return a Unix timestamp for one snippet date prefix, 0 if unknown."""
        parts = stamp.split(" ")
        # calendar.month_abbr[0] is '', so index 0 never denotes a month.
        month = months.index(parts[0]) if parts[0] in months else 0
        if month:
            # Absolute date such as "Jan 5, 2015" -> "1 5, 2015".
            numeric_date = str(month) + " " + " ".join(parts[1:])
            try:
                return arrow.get(numeric_date, "M D, YYYY").timestamp
            except Exception:
                # Best effort: a malformed date yields the 0 sentinel
                # instead of aborting the whole crawl.
                return 0
        if "day" in stamp:
            # Relative date such as "3 days ago". The original code read a
            # stale comprehension variable here and called int() on a list,
            # so this case could never produce a timestamp.
            try:
                days_ago = int(parts[0])
            except ValueError:
                return 0
            return arrow.utcnow().replace(days=-days_ago).timestamp
        return 0

    df["timestamp"] = [
        _snippet_timestamp(span.split("...")[0].strip())
        for span in df.link_span
    ]

    data = df
    print(data)
    data["domain"] = domain
    data["event_type"] = "CompanyBlogEvent"
    data = data.applymap(self._remove_non_ascii)
    # The key is the concatenation of all cell values, capped at 124 chars.
    data["event_key"] = [
        "".join(map(str, record.values()))[:124]
        for record in data.to_dict("r")
    ]
    # Drop missing cells so they are not written out as null fields.
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
def _company_profile(self, name, api_key=""):
    """Find a company's Indeed profile through Google and persist it.

    Searches Google for ``site:indeed.com/cmp <name>``, derives a candidate
    name from each result's link text (the part before "Careers and
    Employment") and keeps candidates whose ``fuzz.ratio`` against ``name``
    is above 70.  The best-scoring result's page is converted with
    ``_html_to_dict`` and persisted under the "indeed" source.

    Args:
        name: Company name to search for and to match results against.
        api_key: Forwarded unchanged to ``CompanyInfoCrawl()._persist``.

    Returns:
        Whatever ``_persist`` returns for the stub record when no result
        matches; ``None`` after a parsed profile has been persisted.
    """
    df = Google().search('site:indeed.com/cmp {0}'.format(name))

    if not df.empty:
        df['_name'] = [
            text.split("Careers and Employment")[0].strip()
            for text in df.link_text
        ]
        df["score"] = [fuzz.ratio(candidate, name) for candidate in df._name]
        df = df[df.score > 70]
        # Sort first, then renumber, so that label 0 is the best match.
        # The original renumbered before sorting, which made ``df.ix[0]``
        # return the first row that passed the filter instead of the
        # highest-scoring one.
        df = df.sort('score', ascending=False)
        df = df.reset_index().drop('index', 1)

    # Covers both "Google found nothing" and "nothing scored above 70".
    if df.empty:
        return CompanyInfoCrawl()._persist({'company_name': name},
                                           "indeed", api_key)

    url = df.ix[0].link
    val = self._html_to_dict(url)
    print("name")
    val["handle"] = url
    val['company_name'] = name
    print(val)
    CompanyInfoCrawl()._persist(val, "indeed", api_key)
def _company_profile(self, name, api_key=""):
    """Find a company's Indeed profile through Google and persist it.

    Searches Google for ``site:indeed.com/cmp <name>``, derives a candidate
    name from each result's link text (the part before "Careers and
    Employment") and keeps candidates whose ``fuzz.ratio`` against ``name``
    is above 70.  The best-scoring result's page is converted with
    ``_html_to_dict`` and persisted under the "indeed" source.

    Args:
        name: Company name to search for and to match results against.
        api_key: Forwarded unchanged to ``CompanyInfoCrawl()._persist``.

    Returns:
        Whatever ``_persist`` returns for the stub record when no result
        matches; ``None`` after a parsed profile has been persisted.
    """
    df = Google().search('site:indeed.com/cmp {0}'.format(name))

    if not df.empty:
        df['_name'] = [
            text.split("Careers and Employment")[0].strip()
            for text in df.link_text
        ]
        df["score"] = [fuzz.ratio(candidate, name) for candidate in df._name]
        df = df[df.score > 70]
        # Sort first, then renumber, so that label 0 is the best match.
        # The original renumbered before sorting, which made ``df.ix[0]``
        # return the first row that passed the filter instead of the
        # highest-scoring one.
        df = df.sort('score', ascending=False)
        df = df.reset_index().drop('index', 1)

    # Covers both "Google found nothing" and "nothing scored above 70".
    if df.empty:
        return CompanyInfoCrawl()._persist({'company_name': name},
                                           "indeed", api_key)

    url = df.ix[0].link
    val = self._html_to_dict(url)
    print("name")
    val["handle"] = url
    val['company_name'] = name
    print(val)
    CompanyInfoCrawl()._persist(val, "indeed", api_key)