Example #1
0
 def _domain_search(self, domain, api_key="", name=""):
     qry = 'site:zoominfo.com/c/ {0}'.format(domain)
     df = Google().search(qry)
     if df.empty: 
         data = {'company_name': name, "domain":domain}
         return CompanyInfoCrawl()._persist(data,"zoominfo",api_key)
     df['_name'] = [i.split("Company Profile")[0].strip() 
                    for i in df.link_text]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.sort('score',ascending=False)
     if df.empty: 
       data = {'company_name': name, "domain":domain}
       return CompanyInfoCrawl()._persist(data,"zoominfo",api_key)
     df = df.reset_index().drop('index',1)
     url = df.ix[0].link
     print "ZOOMINFO URL", url
     html = Google().cache(url)
     html = requests.get(url).text
     html = self._remove_non_ascii(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = name
     zoominfo['handle'] = url
     zoominfo["domain_search"] = True
     zoominfo["domain"] = domain
     print zoominfo
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
 def _domain_search(self, domain, api_key="", name=""):
     qry = 'site:zoominfo.com/c/ {0}'.format(domain)
     df = Google().search(qry)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df['_name'] = [
         i.split("Company Profile")[0].strip() for i in df.link_text
     ]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.sort('score', ascending=False)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df = df.reset_index().drop('index', 1)
     url = df.ix[0].link
     print "ZOOMINFO URL", url
     html = Google().cache(url)
     html = requests.get(url).text
     html = self._remove_non_ascii(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = name
     zoominfo['handle'] = url
     zoominfo["domain_search"] = True
     zoominfo["domain"] = domain
     print zoominfo
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
Example #3
0
    def _company_blog(self, domain, api_key="", name=""):
        #TODO get blog url
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
        print df
        if df.empty: return
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except:
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except:
                if "day" in i:
                    num = int(i.split())
                    timestamps.append(arrow.utcnow().replace(days=num *
                                                             -1).timestamp)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps

        data = {'data': df.to_dict('r'), 'blog_url': url}
        data["domain"] = domain
        data["api_key"] = api_key
        data["company_name"] = name
        CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
Example #4
0
    def _company_blog(self, domain, api_key="", name=""):
        #TODO get blog url
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
        print df
        if df.empty: return
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index',1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except:
                timestamps.append(0)
                continue
            _date = str(num)+" "+" ".join(_date.split(" ")[1:])
            try:
              timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except:
                if "day" in i:
                  num = int(i.split())
                  timestamps.append(arrow.utcnow().replace(days=num*-1).timestamp)
                else:
                  timestamps.append(0)
        df["timestamp"] = timestamps

        data = {'data': df.to_dict('r'), 'blog_url':url}
        data["domain"] = domain
        data["api_key"] = api_key
        data["company_name"] = name
        CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
    def _company_blog(self, domain, period=None):
        #TODO get blog url
        if period:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
        else:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1)

        if df.empty: return
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except:
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except:
                if "day" in i:
                    num = int(i.split())
                    timestamps.append(arrow.utcnow().replace(days=num *
                                                             -1).timestamp)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps
        data = df
        print data
        data["domain"] = domain
        data["event_type"] = "CompanyBlogEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        data["event_key"] = [
            "".join(map(str, _data.values()))[:124]
            for _data in data.to_dict("r")
        ]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
Example #6
0
    def _company_blog(self, domain, period=None):
        #TODO get blog url
        if period:
          df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
        else:
          df = Google().search('inurl:blog site:{0}'.format(domain), 1)

        if df.empty: return
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index',1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except:
                timestamps.append(0)
                continue
            _date = str(num)+" "+" ".join(_date.split(" ")[1:])
            try:
              timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except:
                if "day" in i:
                  num = int(i.split())
                  timestamps.append(arrow.utcnow().replace(days=num*-1).timestamp)
                else:
                  timestamps.append(0)
        df["timestamp"] = timestamps
        data = df
        print data
        data["domain"] = domain
        data["event_type"] = "CompanyBlogEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        data["event_key"] = ["".join(map(str, _data.values()))[:124]
                             for _data in data.to_dict("r")]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
Example #7
0
 def _company_profile(self, name, api_key=""):
     df = Google().search('site:indeed.com/cmp {0}'.format(name))
     if df.empty: 
         return CompanyInfoCrawl()._persist({'company_name': name}, 
                                            "indeed", api_key)
     df['_name'] = [i.split("Careers and Employment")[0].strip() 
                    for i in df.link_text]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.reset_index().drop('index',1)
     df = df.sort('score',ascending=False)
     if df.empty: 
       return CompanyInfoCrawl()._persist({'company_name': name},"indeed",api_key)
     else:
       url = df.ix[0].link
     val = self._html_to_dict(url)
     print "name"
     val["handle"] = url
     val['company_name'] = name
     print val
     CompanyInfoCrawl()._persist(val, "indeed", api_key)
Example #8
0
 def _company_profile(self, name, api_key=""):
     df = Google().search('site:indeed.com/cmp {0}'.format(name))
     if df.empty:
         return CompanyInfoCrawl()._persist({'company_name': name},
                                            "indeed", api_key)
     df['_name'] = [
         i.split("Careers and Employment")[0].strip() for i in df.link_text
     ]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.reset_index().drop('index', 1)
     df = df.sort('score', ascending=False)
     if df.empty:
         return CompanyInfoCrawl()._persist({'company_name': name},
                                            "indeed", api_key)
     else:
         url = df.ix[0].link
     val = self._html_to_dict(url)
     print "name"
     val["handle"] = url
     val['company_name'] = name
     print val
     CompanyInfoCrawl()._persist(val, "indeed", api_key)