Example #1
    def _press_releases(self, domain, api_key="", company_name=""):
        ''' Google News, PRNewsWire, BusinessWire '''
        pw = Google().search('"{0}" site:prnewswire.com'.format(company_name))
        bw = Google().search(
            '"{0}" site:businesswire.com'.format(company_name))
        mw = Google().search('"{0}" site:marketwired.com'.format(company_name))
        nw = Google().search('"{0}" site:newswire.ca'.format(company_name))
        rt = Google().search('"{0}" site:reuters.com'.format(company_name))

        p = pd.concat([pw, bw, mw, nw, rt])
        p = p.drop_duplicates()
        p['date'] = [
            span.split('Business Wire')[-1].split('...')[0].strip()
            for span in p.link_span
        ]
        p['description'] = [
            "".join(span.split('...')[1:]).strip() for span in p.link_span
        ]
        p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
        p['title'] = p['link_text']

        p = p.drop('link_text', 1)
        p = p.drop('url', 1)
        p = p.drop('link_span', 1)
        #for i in p.timestamp: print i

        press = {'data': p.to_dict('records'), 'company_name': company_name}
        press["domain"] = domain
Example #2
 def _domain_search(self, domain, api_key="", name=""):
     qry = 'site:zoominfo.com/c/ {0}'.format(domain)
     df = Google().search(qry)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df['_name'] = [
         i.split("Company Profile")[0].strip() for i in df.link_text
     ]
     df["score"] = [fuzz.ratio(b, name) for b in df._name]
     df = df[df.score > 70]
     df = df.sort('score', ascending=False)
     if df.empty:
         data = {'company_name': name, "domain": domain}
         return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
     df = df.reset_index().drop('index', 1)
     url = df.ix[0].link
     print "ZOOMINFO URL", url
     html = requests.get(url).text
     html = self._remove_non_ascii(html)
     zoominfo = self._cache_html_to_df(html)
     zoominfo['company_name'] = name
     zoominfo['handle'] = url
     zoominfo["domain_search"] = True
     zoominfo["domain"] = domain
     print zoominfo
     CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
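The ZoomInfo lookup above keeps only Google results whose link text fuzzily matches the requested company name (fuzz.ratio > 70, best score first). A minimal sketch of that filtering step in isolation, using a hypothetical filter_results_by_name helper and an illustrative '_name' column:

import pandas as pd
from fuzzywuzzy import fuzz

def filter_results_by_name(df, name, threshold=70):
    # Score every candidate name against the target and keep close matches,
    # highest score first. Returns an empty frame if nothing clears the bar.
    df = df.copy()
    df["score"] = [fuzz.ratio(candidate, name) for candidate in df._name]
    df = df[df.score > threshold]
    return df.sort_values("score", ascending=False).reset_index(drop=True)

# Usage sketch:
# candidates = pd.DataFrame({"_name": ["Acme Corp", "Acme Inc.", "Other Co"]})
# filter_results_by_name(candidates, "Acme Corp")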
Example #3
    def _recent_webpages_published(self, domain, period=None):
        if period:
            df = Google().search("site:{0}".format(domain), 1, "d")
            #df2 = Google().search("{0}".format(name), 1, "d")
        else:
            df = Google().search("site:{0}".format(domain))
            #df2 = Google().search("{0}".format(name))

        # TODO - add timestamps
        # TODO - queue scrapes
        #df = pd.concat([df, df2])
        data = df
        if data.empty: return "NO RECENT WEBPAGES"
        data["domain"] = domain
        data["event_type"] = "RecentWebpageEvent"
        print data
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        print data
        data["event_key"] = [
            "".join(map(str,
                        _data.to_dict().values()))[:124]
            for i, _data in data.iterrows()
        ]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
Example #4
    def _employees(self, company_name="", keyword=None):
        ''' Linkedin Scrape '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword if keyword else "")
        #results = Google().search(qry, 10)
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty:
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company_name
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company_name
            ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
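The query built at the top of this example chains several -inurl: exclusions in front of the site: restriction so that mostly individual profile pages come back. A small helper that assembles the same kind of query string might look like this; build_employee_query is a hypothetical name, the exclusion list is copied from the example, and operator behaviour ultimately depends on Google:

def build_employee_query(company_name, keyword=""):
    # Exclude LinkedIn directory, job and company pages so that mostly
    # individual profile results remain.
    exclusions = ["/dir/", "/find/", "/updates", "/title/", "/pulse/",
                  "job", "jobs2", "company"]
    args = " ".join('-inurl:"{0}"'.format(e) for e in exclusions)
    return '"at {0}" {1} {2} site:linkedin.com'.format(company_name, args, keyword)

# build_employee_query("Acme Corp", "engineer")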
Example #5
 def _daily_news(self, domain, api_key="", name=""):
     df = Google().search('site:twitter.com {0}'.format(domain))
     link = df.link.tolist()[0]
     html = Google().cache(link)
     tweets = self._tweets(html, api_key)
     data = {"data": tweets, "company_name": name, "domain": domain}
     CompanyExtraInfoCrawl()._persist(data, "tweets")
Example #6
 def _reviews(self, domain, api_key="", name=""):
     df = Google().search('site:glassdoor.com/reviews {0}'.format(name))
     if df.empty: return
     url = df.ix[0].link
     r = BeautifulSoup(Google().cache(url))
     rating = r.find('div', {'class': 'ratingNum'})
     rating = rating.text if rating else ""
     # TODO - awards
     reviews = pd.DataFrame()
     for review in r.find_all('li', {'class': 'empReview'}):
         pros = review.find('p', {'class': 'pros'})
         cons = review.find('p', {'class': 'cons'})
         extra = review.find('p', {'class': 'notranslate'})
         summary = review.find('span', {'class': 'summary'})
         date = review.find('time', {'class': 'date'})
         vals = [pros, cons, extra, summary, date]
         cols = ["pros", "cons", "extra", "summary", "date"]
         vals = [val.text.strip() if val else "" for val in vals]
         data = dict(zip(cols, vals))
         data["timestamp"] = Helper()._str_to_timestamp(data["date"])
         reviews = reviews.append(data, ignore_index=True)
     data = {'data': reviews.to_dict('r'), 'company_name': name}
     data['api_key'] = api_key
     data['domain'] = domain
     CompanyExtraInfoCrawl()._persist(data, "glassdoor_reviews", api_key)
Example #7
    def _parse_response(self, html, company_name, keyword=None):
        results = Google()._results_html_to_df(html)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty:
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company_name
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company_name
            ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
Example #8
    def _old_parse_article_html(self, objectId, title, industry_press=None):
        df = Google().search("{0} site:marketwired.com".format(title))
        html = Google().cache(df.link.tolist()[0])
        article = BeautifulSoup(html).find("div",{"class":"mw_release"})
        article = article.text if article else None
        #company_name = BeautifulSoup(html).find("span",{"itemprop":"name"})
        company_name = BeautifulSoup(html).find("strong")
        company_name = company_name.text.split("SOURCE:")[-1] if company_name else None
        #q.enqueue(ClearSpark()._bulk_company_info, company_name)
        links, website = [], None
        for a in BeautifulSoup(html).find_all("a"):
            if "href" not in a.attrs: continue
            href = a["href"].lower()
            if "http" not in href: continue
            elif "marketwire" in href: continue
            elif "javascript" in href: continue
            elif "linkedin" in href: continue
            elif "twitter" in href: continue
            elif "youtube" in href: continue
            elif "flickr" in href: continue
            elif "facebook" in href: continue
            elif "google" in href: continue
            elif "addthis" in href: continue
            elif "sysomos" in href: continue

            if "target" in a.attrs:
                website = a["href"]
            links.append(href.strip())

        info = {"article": article, "company_name": company_name, 
                "website":website, "links":links}
        return info
Example #9
    def search_webhook(self, domain, objectId):
        pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
        bw = Google().search('"{0}" site:businesswire.com'.format(domain))
        job_queue_lol = objectId + str(arrow.now().timestamp)

        if not pw.empty:
            for link in pw.link:
                print "PW STARTED", pw.shape, link
                job = q.enqueue(PRNewsWire()._email_webhook,
                                domain,
                                link,
                                job_queue_lol,
                                objectId,
                                timeout=3600)
                job.meta['profile_id1'] = job_queue_lol
                job.save()
        print len(q.jobs)

        if not bw.empty:
            for link in bw.link:
                print "BW STARTED", bw.shape, link
                job = q.enqueue(BusinessWire()._email_webhook,
                                domain,
                                link,
                                job_queue_lol,
                                objectId,
                                timeout=3600)
                job.meta['profile_id1'] = job_queue_lol
                job.save()
        print len(q.jobs)
Example #10
    def streaming_search(self, domain):
        pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
        bw = Google().search('"{0}" site:businesswire.com'.format(domain))

        pn_emails = bw_emails = pd.DataFrame(columns=["pattern"])
        for link in pw.link:
            pn_emails = PRNewsWire()._find_emails(domain, link, False)
        for link in bw.link:
            bw_emails = BusinessWire()._find_emails(domain, link, False)
        ''' enqueue and return values '''
        return pd.concat([pn_emails, bw_emails]).drop_duplicates('pattern')
Example #11
  def _email_search(self, email, api_key=""):
      try:
          person = clearbit.Person.find(email=email, stream=True)
      except:
          person = None
      data = {"pattern":None, "name":None, "email":email,
              "domain":email.split("@")[-1], "crawl_source":"email_hunter"}
      if person:
          pattern = EmailGuessHelper()._find_email_pattern(person["name"]["fullName"], email)
          if pattern: 
              data = {"pattern":pattern, "name":person["name"]["fullName"], "email":email,
                      "domain":email.split("@")[-1], "crawl_source":"email_hunter"}
      elif not person or not pattern:
          person = FullContact()._person_from_email(email)
          print person
          try:
              person = person["contactInfo"]["fullName"]
              fullcontact_person = True
          except:
              fullcontact_person = False

          if fullcontact_person:
              pattern = EmailGuessHelper()._find_email_pattern(person, email)
              data = {"pattern":pattern, "name":person, "email":email,
                      "domain":email.split("@")[-1], "crawl_source":"email_hunter"}
              print pattern
          else:
              _email = email.replace(".", " ").replace("-", " ").replace("_"," ")
              _email = _email.replace("@", " ")
              g = Google().search("{0} site:linkedin.com/pub".format(_email))
              g1 = Google().search("{0} site:linkedin.com/pub".format(_email.split(" ")[0]))
              g2 = Google().search("{0} site:linkedin.com/pub".format(_email.split(" ")[-1]))
              g = pd.concat([g, g1, g2])
              choices = [i.split(" |")[0] for i in g.link_text]
              person = process.extract(_email, choices, limit=1)
              try:
                person = person[0][0]
              except:
                pass
              pattern = EmailGuessHelper()._find_email_pattern(person, email)
              print "google search pattern", pattern
              if pattern:
                  data = {"pattern":pattern, "name":person, "email":email,
                          "domain":email.split("@")[-1], "crawl_source":"email_hunter"}
              else:
                  data = {"pattern":None, "name":None, "email":email,
                          "domain":email.split("@")[-1], "crawl_source":"email_hunter"}
      #data = pd.DataFrame([data])
      conn = r.connect(host="localhost", port=28015, db="triggeriq")
      r.table('email_pattern_crawls').insert(data).run(conn)
      #CompanyEmailPatternCrawl()._persist(data, "emailhunter", api_key)
      # persist to rethinkdb
      print "person", person
Example #12
 def _daily_news(self, domain, api_key="",  name=""):
     df = Google().search("site:linkedin.com/company {0}".format(domain))
     if df.empty: return 
     #for link in df.link:
     link = df.link.tolist()[0]
     print link
     html = Google().cache(link)
     posts = self._company_posts(html)
     #Linkedin()._signal(link, api_key)
     data = {"data":posts, "company_name":name, "domain":domain}
     CompanyExtraInfoCrawl()._persist(data, "linkedin_posts", api_key)
Example #13
 def _company_profile(self, name, api_key=""):
     df = Google().search('site:facebook.com {0}'.format(name))
     if df.empty: return df
     url = df.link.tolist()[0]
     html = Google().cache(url)
     #browser = Browser('phantomjs')
     #browser.visit(url)
     val = self._html_to_dict(html)
     print val
     val["company_name"] = name
     CompanyInfoCrawl()._persist(val, "facebook", api_key)
Example #14
 def _daily_news(self, domain, api_key="", name=""):
     df = Google().search('site:facebook.com {0}'.format(domain))
     link = df.link.tolist()[0]
     html = Google().cache(link)
     posts = Facebook()._posts(html)
     posts = pd.DataFrame(posts).fillna("")
     data = {
         "data": posts.to_dict("r"),
         "domain": domain,
         "company_name": name
     }
     CompanyExtraInfoCrawl()._persist(data, "facebook_posts", api_key)
Example #15
 def _news(self, domain, api_key="", company_name="", period=None):
     name = domain.split(".")[0] if company_name == "" else company_name
     if period:
         df = Google().news_search(name, 1, period)
     else:
         df = Google().news_search(name)
     print df
     data = {'data': df.to_dict('r'), 'site_url': domain}
     data["domain"] = domain
     data["api_key"] = api_key
     data["company_name"] = company_name
     CompanyExtraInfoCrawl()._persist(data, "general_news", api_key)
Example #16
    def _directory_search(self, name, description):
        qry = name+' "{0}" site:linkedin.com/pub/dir'.format(description)
        qry = filter(lambda x: x in string.printable, qry)
        results = Google().search(qry)
        count = 0
        while results.empty:
            print "trying again"
            results = Google().search(qry)
            count = count + 1
            if count > 2: break

        print results
        return results.url if not results.empty else []
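The retry loop above simply re-issues the same query a couple of times when Google returns an empty frame. The same idea as a small reusable helper; search_with_retry is a hypothetical name built around the Google().search call used throughout these examples:

def search_with_retry(query, attempts=3):
    # Re-issue the query until results come back or the attempts run out;
    # the last (possibly empty) frame is returned either way.
    results = Google().search(query)
    for _ in range(attempts - 1):
        if not results.empty:
            break
        results = Google().search(query)
    return results

# results = search_with_retry('john smith "engineer" site:linkedin.com/pub/dir')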
Example #17
 def _domain_search(self, domain, api_key="", name=""):
     df = Google().search('site:facebook.com {0}'.format(domain))
     for url in df.link:
         #browser = Browser('phantomjs')
         #browser.visit(url)
         # html = browser.html
         html = Google().cache(url)
         if domain not in BeautifulSoup(html).text: continue
         val = self._html_to_dict(html)
         val["company_name"] = name
         val["domain"] = domain
         CompanyInfoCrawl()._persist(val, "facebook", api_key)
         break
Example #18
    def _news(self, domain, api_key="", company_name=""):
        # TODO - include general info links
        browser = Browser('phantomjs')
        browser.visit('http://google.com')
        browser.find_by_name('q').first.fill(company_name)
        browser.find_by_name('btnG').first.click()
        browser.find_link_by_text('News').first.click()
        url = browser.evaluate_script("document.URL")
        url = url + "&tbs=qdr:m,sbd:1" + "&num=100&filter=0&start=0"
        browser.visit(url)
        pages = pd.DataFrame()
        df = Google()._results_html_to_df(browser.html)

        pages = pages.append(df)
        #print browser.find_by_css('td > a')
        if browser.find_by_css('td > a') == []:
            pages = pages.to_dict('r')
            pages = {
                'data': pages,
                'company_name': company_name,
                "domain": domain
            }
            CompanyExtraInfoCrawl()._persist(pages, "general_news", api_key)

        try:
            _next = browser.find_by_css('td > a')[-1].text
        except:
            _next = None
        if _next:
            while "Next" in _next:
                browser.find_by_css('td > a')[-1].click()
                df = Google()._results_html_to_df(browser.html)
                pages = pages.append(df)
                # refresh the "Next" link text so the loop can terminate
                try:
                    _next = browser.find_by_css('td > a')[-1].text
                except IndexError:
                    break

        #pages = pages[~pages.title.str.contains("press release")]
        pages = pages[pages.link_span.str.contains(
            '(?i){0}'.format(company_name))]
        pages.columns = ['link', 'description', 'title', 'info', '']
        pages['date'] = [i.split('-')[-1] for i in pages['info']]
        pages["timestamp"] = [
            Helper()._str_to_timestamp(i) for i in pages.date
        ]
        pages['news_source'] = [i.split('-')[0] for i in pages['info']]
        pages = pages.drop_duplicates()
        del pages[""]
        print pages.columns

        pages = pages.to_dict('r')
        pages = {'data': pages, 'company_name': company_name, "domain": domain}
        CompanyExtraInfoCrawl()._persist(pages, "general_news", api_key)
Example #19
    def garble(self, content, time=10):
        print(content)
        print("--------")
        translator = Google()
        self.curr = content
        while (time > 0):
            print("current time is " + str(time))
            self.curr = translator.translate(
                'zh-CN',
                'fr',
                self.curr,
            )
            self.curr = translator.translate(
                'fr',
                'ko',
                self.curr,
            )
            self.curr = translator.translate(
                'ko',
                'zh-CN',
                self.curr,
            )
            time -= 1

        return self.curr
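The same round trip written as a standalone helper; round_trip is a hypothetical name, the translate(source, target, text) argument order is taken from the example above, and the language ring is the one garble() uses:

def round_trip(translator, text, cycles=3, langs=("zh-CN", "fr", "ko")):
    # Push the text around a ring of languages and back to the first one,
    # degrading it slightly on every pass (the same idea as garble()).
    current = text
    for _ in range(cycles):
        for src, dst in zip(langs, langs[1:] + (langs[0],)):
            current = translator.translate(src, dst, current)
    return current

# round_trip(Google(), u"Hello, world", cycles=2)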
Example #20
 def _search(self, company_name, api_key=""):
     qry = 'site:linkedin.com inurl:"at-{0}" inurl:title -inurl:job'
     #TODO - remove, all [".","'",","]
     name = company_name.strip().lower().replace(" ", "-")
     dirs = Google().search(qry.format(name), 1)
     for url in dirs.url:
         q.enqueue(LinkedinTitleDir().parse, url, company_name)
Example #21
    def _company_blog(self, domain, api_key="", name=""):
        #TODO get blog url
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
        print df
        if df.empty: return
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except:
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except:
                if "day" in i:
                    num = int(i.split())
                    timestamps.append(arrow.utcnow().replace(days=num *
                                                             -1).timestamp)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps

        data = {'data': df.to_dict('r'), 'blog_url': url}
        data["domain"] = domain
        data["api_key"] = api_key
        data["company_name"] = name
        CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
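The timestamp loop above pulls dates such as "Mar 5, 2015 ..." or "3 days ago ..." out of Google snippet text. A more compact sketch of the same idea; snippet_to_timestamp is a hypothetical helper, the format strings are assumptions about what the snippets look like, and arrow 1.x is assumed for the timestamp() call:

import arrow

def snippet_to_timestamp(snippet):
    # Snippets usually start with either an absolute date ("Mar 5, 2015 ...")
    # or a relative one ("3 days ago ..."); return a unix timestamp, else 0.
    date_part = snippet.split("...")[0].strip()
    if "day" in date_part:
        try:
            days = int(date_part.split(" ")[0])
        except ValueError:
            return 0
        return arrow.utcnow().shift(days=-days).timestamp()
    try:
        return arrow.get(date_part, "MMM D, YYYY").timestamp()
    except (arrow.parser.ParserError, ValueError):
        return 0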
Example #22
 def _signal(self, link, api_key=""):
     html = Google().cache(link)
     info = self._html_to_dict(html)
     posts = self._posts(html)
     CompanyInfoCrawl()._persist(info, "facebook", api_key)
     for post in posts:
         CompanyExtraInfoCrawl()._persist(post, "facebook_posts", api_key)
Example #23
 def _html_to_dict(self, url):
     r = BeautifulSoup(Google().cache(url))
     logo = r.find('div', {'class': 'logo'})
     if logo:
         logo = logo.find('img')
         logo = logo['src'] if logo else ""
     else:
         logo = ""
     #website = r.find('span',{'class':'hideHH'}).text
     info = r.find('div', {'id': 'EmpBasicInfo'})
     if info:
         info = info.find_all('div', {'class': 'empInfo'})
     else:
         return {}
     info = dict([[
         i.find('strong').text.lower().strip(),
         i.find('span').text.strip()
     ] for i in info])
     info['name'] = r.find('div', {'class': 'header'}).find('h1').text
     info['description'] = r.find('p', {'id': 'EmpDescription'})
     info['description'] = info['description'].text if info[
         'description'] else ""
     info['logo'] = logo
     info['handle'] = url
     return info
Example #24
 def _signal(self, link, api_key=""):
     html = Google().cache(link)
     info = self._html_to_dict(html)
     tweets = self._tweets(html)
     CompanyInfoCrawl()._persist(info, "twitter", api_key)
     for tweet in tweets:
         CompanyExtraInfoCrawl()._persist(tweet, "tweets", api_key)
Example #25
 def _linkedin_profile_from_name(self, company_name):
     qry = company_name+' site:linkedin.com/company'
     google_results = Google().search(qry)
     if google_results.empty: return "not found"
     url = google_results.ix[0].url
     # scrape cache
     return url if "/company/" in url else "not found"
Example #26
 def init(self):
     self.google = Google()
     self.bar = [i for i in u'.' * self.size]
     self.size = float(self.size)
     self.min = float(self.min)
     self.max = float(self.max)
     self.range = self.max - self.min
Example #27
def main():

    #fetch database credentials from env variables
    db_name = os.environ["ITIME_DB"]
    db_user = os.environ["ITIME_DB_USER"]
    db_password = os.environ["ITIME_DB_PASSWORD"]

    db = Database(db_name, db_user, db_password)
    connection_tries = 0

    #try to establish db connection, quit if it fails
    while (not db.connect()):
        print("Trying to reconnect to db,try starting postgres")
        time.sleep(5)
        if (connection_tries > 1):
            sys.exit(0)
        connection_tries += 1

    #fetch google client secret file path
    google_api_file = os.environ["ITIME_GOOGLE_API_FILE"]
    google = Google(google_api_file)

    controller = Controller(db, google)

    #Config for rabbitmq
    rabbit_server = os.environ["ITIME_RABBIT_SERVER"]
    rabbit_queue = os.environ["ITIME_RABBIT_US_QUEUE"]

    rabbit = AmqpServer(rabbit_server, rabbit_queue, controller.incoming)
    rabbit.start()
    print("Exiting...")
Example #28
    def _press_search(self, domain, api_key):
        pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
        bw = Google().search('"{0}" site:businesswire.com'.format(domain))
        #job_queue_lol = objectId+str(arrow.now().timestamp)
        print bw, pw
        pw = pw if not pw.empty else pd.DataFrame(columns=["link"])
        bw = bw if not bw.empty else pd.DataFrame(columns=["link"])
        queue = "press-check-" + domain
        for link in pw.link:
            job = q.enqueue(PRNewsWire()._email, domain, link, timeout=3600)
            RQueue()._meta(job, "{0}_{1}".format(domain, api_key))

        for link in bw.link:
            job = q.enqueue(BusinessWire()._email, domain, link, timeout=3600)
            RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
Example #29
 def _reviews(self, domain, api_key="", name=""):
     df = Google().search('site:glassdoor.com/reviews {0}'.format(name))
     if df.empty: return
     url = df.ix[0].link
     r = BeautifulSoup(Crawlera().get(url).text)
     if not r.find("a", {"class": "sortByDate"}): return
     url = "http://glassdoor.com" + r.find("a",
                                           {"class": "sortByDate"})["href"]
     print url
     r = requests.get(
         "http://localhost:8950/render.html?url={0}".format(url))
     r = BeautifulSoup(r.text)
     rating = r.find('div', {'class': 'ratingNum'})
     rating = rating.text if rating else ""
     # TODO - awards
     reviews = pd.DataFrame()
     for review in r.find_all('li', {'class': 'empReview'}):
         pros = review.find('p', {'class': 'pros'})
         cons = review.find('p', {'class': 'cons'})
         extra = review.find('p', {'class': 'notranslate'})
         summary = review.find('span', {'class': 'summary'})
         date = review.find('time', {'class': 'date'})
         vals = [pros, cons, extra, summary, date]
         cols = ["pros", "cons", "extra", "summary", "date"]
         vals = [val.text.strip() if val else "" for val in vals]
         data = dict(zip(cols, vals))
         data["timestamp"] = Helper()._str_to_timestamp(data["date"])
         reviews = reviews.append(data, ignore_index=True)
     return reviews
Example #30
 def init(self):
     self.colorlib = self.madcow.colorlib
     try:
         self.learn = Learn(madcow=madcow)
     except:
         self.learn = None
     self.google = Google()