def _press_releases(self, domain, api_key="", company_name=""):
    ''' Google News, PRNewsWire, BusinessWire '''
    pw = Google().search('"{0}" site:prnewswire.com'.format(company_name))
    bw = Google().search('"{0}" site:businesswire.com'.format(company_name))
    mw = Google().search('"{0}" site:marketwired.com'.format(company_name))
    nw = Google().search('"{0}" site:newswire.ca'.format(company_name))
    rt = Google().search('"{0}" site:reuters.com'.format(company_name))
    p = pd.concat([pw, bw, mw, nw, rt])
    p = p.drop_duplicates()
    # strip the "Business Wire" prefix (when present) before pulling the date
    p['date'] = [span.split('Business Wire')[-1].split('...')[0].strip()
                 for span in p.link_span]
    p['description'] = ["".join(span.split('...')[1:]).strip()
                        for span in p.link_span]
    p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
    p['title'] = p['link_text']
    p = p.drop('link_text', 1)
    p = p.drop('url', 1)
    p = p.drop('link_span', 1)
    press = {'data': p.to_dict('records'), 'company_name': company_name}
    press["domain"] = domain
def _domain_search(self, domain, api_key="", name=""):
    qry = 'site:zoominfo.com/c/ {0}'.format(domain)
    df = Google().search(qry)
    if df.empty:
        data = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
    df['_name'] = [i.split("Company Profile")[0].strip() for i in df.link_text]
    df["score"] = [fuzz.ratio(b, name) for b in df._name]
    df = df[df.score > 70]
    df = df.sort('score', ascending=False)
    if df.empty:
        data = {'company_name': name, "domain": domain}
        return CompanyInfoCrawl()._persist(data, "zoominfo", api_key)
    df = df.reset_index().drop('index', 1)
    url = df.ix[0].link
    print "ZOOMINFO URL", url
    # fetch the live profile page
    html = requests.get(url).text
    html = self._remove_non_ascii(html)
    zoominfo = self._cache_html_to_df(html)
    zoominfo['company_name'] = name
    zoominfo['handle'] = url
    zoominfo["domain_search"] = True
    zoominfo["domain"] = domain
    print zoominfo
    CompanyInfoCrawl()._persist(zoominfo, "zoominfo", api_key)
def _recent_webpages_published(self, domain, period=None):
    if period:
        df = Google().search("site:{0}".format(domain), 1, "d")
    else:
        df = Google().search("site:{0}".format(domain))
    # TODO - add timestamps
    # TODO - queue scrapes
    data = df
    if data.empty:
        return "NO RECENT WEBPAGES"
    data["domain"] = domain
    data["event_type"] = "RecentWebpageEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    print data
    data["event_key"] = ["".join(map(str, _data.to_dict().values()))[:124]
                         for i, _data in data.iterrows()]
    data = [row.dropna().to_dict() for i, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
def _employees(self, company_name="", keyword=None):
    ''' Linkedin scrape via a Google site: search '''
    # TODO - add linkedin directory search
    args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
    args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
    qry = '"at {0}" {1} {2} site:linkedin.com'
    qry = qry.format(company_name, args, keyword)
    results = Google().search(qry, 1)
    results = results.dropna()
    results = Google()._google_df_to_linkedin_df(results)
    _name = '(?i){0}'.format(company_name)
    if results.empty:
        print "No employees found for", company_name, keyword
        return results
    # multi-word names are matched on a partial ratio, single words on a full ratio
    if " " in company_name:
        results['company_score'] = [fuzz.partial_ratio(_name, company)
                                    for company in results.company_name]
    else:
        results['company_score'] = [fuzz.ratio(_name, company)
                                    for company in results.company_name]
    if keyword:
        results['score'] = [fuzz.partial_ratio(keyword, title)
                            for title in results.title]
        results = results[results.score > 75]
    results = results[results.company_score > 49]
    results = results.drop_duplicates()
    return results
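# Illustrative usage sketch (not from the original source): assuming this
# method lives on the crawler class that owns the other Linkedin helpers here,
# a caller could filter the scored results like so. The class name
# "EmployeeSearch" is a placeholder.
#
#   employees = EmployeeSearch()._employees("Acme Corp", keyword="engineer")
#   if not employees.empty:
#       print employees[["title", "company_name", "company_score"]].head()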
def _daily_news(self, domain, api_key="", name=""):
    df = Google().search('site:twitter.com {0}'.format(domain))
    if df.empty:
        return
    link = df.link.tolist()[0]
    html = Google().cache(link)
    tweets = self._tweets(html, api_key)
    data = {"data": tweets, "company_name": name, "domain": domain}
    CompanyExtraInfoCrawl()._persist(data, "tweets")
def _reviews(self, domain, api_key="", name=""):
    df = Google().search('site:glassdoor.com/reviews {0}'.format(name))
    if df.empty:
        return
    url = df.ix[0].link
    r = BeautifulSoup(Google().cache(url))
    rating = r.find('div', {'class': 'ratingNum'})
    rating = rating.text if rating else ""
    # TODO - awards
    reviews = pd.DataFrame()
    for review in r.find_all('li', {'class': 'empReview'}):
        pros = review.find('p', {'class': 'pros'})
        cons = review.find('p', {'class': 'cons'})
        extra = review.find('p', {'class': 'notranslate'})
        summary = review.find('span', {'class': 'summary'})
        date = review.find('time', {'class': 'date'})
        vals = [pros, cons, extra, summary, date]
        cols = ["pros", "cons", "extra", "summary", "date"]
        # guard against missing fields so .text is never taken on None
        vals = [val.text.strip() if val else "" for val in vals]
        data = dict(zip(cols, vals))
        data["timestamp"] = Helper()._str_to_timestamp(data["date"])
        reviews = reviews.append(data, ignore_index=True)
    data = {'data': reviews.to_dict('r'), 'company_name': name}
    data['api_key'] = api_key
    data['domain'] = domain
    CompanyExtraInfoCrawl()._persist(data, "glassdoor_reviews", api_key)
def _parse_response(self, html, company_name, keyword=None):
    results = Google()._results_html_to_df(html)
    results = results.dropna()
    results = Google()._google_df_to_linkedin_df(results)
    _name = '(?i){0}'.format(company_name)
    if results.empty:
        print "No employees found for", company_name, keyword
        return results
    if " " in company_name:
        results['company_score'] = [fuzz.partial_ratio(_name, company)
                                    for company in results.company_name]
    else:
        results['company_score'] = [fuzz.ratio(_name, company)
                                    for company in results.company_name]
    if keyword:
        results['score'] = [fuzz.partial_ratio(keyword, title)
                            for title in results.title]
        results = results[results.score > 75]
    results = results[results.company_score > 49]
    results = results.drop_duplicates()
    return results
def _old_parse_article_html(self, objectId, title, industry_press=None):
    df = Google().search("{0} site:marketwired.com".format(title))
    html = Google().cache(df.link.tolist()[0])
    article = BeautifulSoup(html).find("div", {"class": "mw_release"})
    article = article.text if article else None
    company_name = BeautifulSoup(html).find("strong")
    company_name = company_name.text.split("SOURCE:")[-1].strip() if company_name else None
    links, website = [], None
    # skip social / boilerplate links when collecting outbound urls
    blacklist = ("marketwire", "javascript", "linkedin", "twitter", "youtube",
                 "flickr", "facebook", "google", "addthis", "sysomos")
    for a in BeautifulSoup(html).find_all("a"):
        if "href" not in a.attrs:
            continue
        href = a["href"].lower()
        if "http" not in href:
            continue
        if any(word in href for word in blacklist):
            continue
        if "target" in a.attrs:
            website = a["href"]
        links.append(href.strip())
    info = {"article": article, "company_name": company_name,
            "website": website, "links": links}
    return info
def search_webhook(self, domain, objectId):
    pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
    bw = Google().search('"{0}" site:businesswire.com'.format(domain))
    job_queue_lol = objectId + str(arrow.now().timestamp)
    if not pw.empty:
        for link in pw.link:
            print "PW STARTED", pw.shape, link
            job = q.enqueue(PRNewsWire()._email_webhook, domain, link,
                            job_queue_lol, objectId, timeout=3600)
            job.meta['profile_id1'] = job_queue_lol
            job.save()
        print len(q.jobs)
    if not bw.empty:
        for link in bw.link:
            print "BW STARTED", bw.shape, link
            job = q.enqueue(BusinessWire()._email_webhook, domain, link,
                            job_queue_lol, objectId, timeout=3600)
            job.meta['profile_id1'] = job_queue_lol
            job.save()
        print len(q.jobs)
def streaming_search(self, domain):
    pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
    bw = Google().search('"{0}" site:businesswire.com'.format(domain))
    # accumulate results so empty searches do not leave the names undefined
    pn_emails, bw_emails = pd.DataFrame(), pd.DataFrame()
    for link in pw.link:
        pn_emails = pn_emails.append(PRNewsWire()._find_emails(domain, link, False))
    for link in bw.link:
        bw_emails = bw_emails.append(BusinessWire()._find_emails(domain, link, False))
    ''' enqueue and return values '''
    return pd.concat([pn_emails, bw_emails]).drop_duplicates('pattern')
def _email_search(self, email, api_key=""):
    try:
        person = clearbit.Person.find(email=email, stream=True)
    except:
        person = None
    pattern = None
    data = {"pattern": None, "name": None, "email": email,
            "domain": email.split("@")[-1], "crawl_source": "email_hunter"}
    if person:
        pattern = EmailGuessHelper()._find_email_pattern(
            person["name"]["fullName"], email)
        if pattern:
            data = {"pattern": pattern, "name": person["name"]["fullName"],
                    "email": email, "domain": email.split("@")[-1],
                    "crawl_source": "email_hunter"}
    if not person or not pattern:
        person = FullContact()._person_from_email(email)
        print person
        try:
            person = person["contactInfo"]["fullName"]
            fullcontact_person = True
        except:
            fullcontact_person = False
        if fullcontact_person:
            pattern = EmailGuessHelper()._find_email_pattern(person, email)
            data = {"pattern": pattern, "name": person, "email": email,
                    "domain": email.split("@")[-1],
                    "crawl_source": "email_hunter"}
            print pattern
        else:
            # fall back to a fuzzy linkedin/pub search built from the mailbox
            _email = email.replace(".", " ").replace("-", " ").replace("_", " ")
            _email = _email.replace("@", " ")
            g = Google().search("{0} site:linkedin.com/pub".format(_email))
            g1 = Google().search("{0} site:linkedin.com/pub".format(_email.split(" ")[0]))
            g2 = Google().search("{0} site:linkedin.com/pub".format(_email.split(" ")[-1]))
            g = pd.concat([g, g1, g2])
            choices = [i.split(" |")[0] for i in g.link_text]
            person = process.extract(_email, choices, limit=1)
            try:
                person = person[0][0]
            except:
                person = None
            pattern = EmailGuessHelper()._find_email_pattern(person, email)
            print "google search pattern", pattern
            if pattern:
                data = {"pattern": pattern, "name": person, "email": email,
                        "domain": email.split("@")[-1],
                        "crawl_source": "email_hunter"}
            else:
                data = {"pattern": None, "name": None, "email": email,
                        "domain": email.split("@")[-1],
                        "crawl_source": "email_hunter"}
    # persist to rethinkdb
    conn = r.connect(host="localhost", port=28015, db="triggeriq")
    r.table('email_pattern_crawls').insert(data).run(conn)
    print "person", person
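# Minimal sketch of the fuzzy-match fallback used above, assuming `process`
# is fuzzywuzzy's process module (consistent with the fuzz.* calls elsewhere
# in this code). The sample strings are made up.
#
#   from fuzzywuzzy import process
#
#   _email = "jane doe example com"                   # "jane.doe@example.com" normalized
#   choices = ["Jane Doe | LinkedIn", "John Smith | LinkedIn"]
#   names = [c.split(" |")[0] for c in choices]
#   best = process.extract(_email, names, limit=1)    # e.g. [("Jane Doe", 90)]
#   person = best[0][0] if best else None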
def _daily_news(self, domain, api_key="", name=""):
    df = Google().search("site:linkedin.com/company {0}".format(domain))
    if df.empty:
        return
    link = df.link.tolist()[0]
    print link
    html = Google().cache(link)
    posts = self._company_posts(html)
    data = {"data": posts, "company_name": name, "domain": domain}
    CompanyExtraInfoCrawl()._persist(data, "linkedin_posts", api_key)
def _company_profile(self, name, api_key=""):
    df = Google().search('site:facebook.com {0}'.format(name))
    if df.empty:
        return df
    url = df.link.tolist()[0]
    html = Google().cache(url)
    val = self._html_to_dict(html)
    print val
    val["company_name"] = name
    CompanyInfoCrawl()._persist(val, "facebook", api_key)
def _daily_news(self, domain, api_key="", name=""):
    df = Google().search('site:facebook.com {0}'.format(domain))
    if df.empty:
        return
    link = df.link.tolist()[0]
    html = Google().cache(link)
    posts = Facebook()._posts(html)
    posts = pd.DataFrame(posts).fillna("")
    data = {"data": posts.to_dict("r"), "domain": domain, "company_name": name}
    CompanyExtraInfoCrawl()._persist(data, "facebook_posts", api_key)
def _news(self, domain, api_key="", company_name="", period=None):
    name = domain.split(".")[0] if company_name == "" else company_name
    if period:
        df = Google().news_search(name, 1, period)
    else:
        df = Google().news_search(name)
    print df
    data = {'data': df.to_dict('r'), 'site_url': domain}
    data["domain"] = domain
    data["api_key"] = api_key
    data["company_name"] = company_name
    CompanyExtraInfoCrawl()._persist(data, "general_news", api_key)
def _directory_search(self, name, description):
    qry = name + ' "{0}" site:linkedin.com/pub/dir'.format(description)
    qry = filter(lambda x: x in string.printable, qry)
    results = Google().search(qry)
    count = 0
    # retry a couple of times when the search comes back empty
    while results.empty:
        print "trying again"
        results = Google().search(qry)
        count = count + 1
        if count > 2:
            break
    print results
    return results.url if not results.empty else []
def _domain_search(self, domain, api_key="", name=""):
    df = Google().search('site:facebook.com {0}'.format(domain))
    for url in df.link:
        html = Google().cache(url)
        # only keep the first cached page that actually mentions the domain
        if domain not in BeautifulSoup(html).text:
            continue
        val = self._html_to_dict(html)
        val["company_name"] = name
        val["domain"] = domain
        CompanyInfoCrawl()._persist(val, "facebook", api_key)
        break
def _news(self, domain, api_key="", company_name=""):
    # TODO - include general info links
    browser = Browser('phantomjs')
    browser.visit('http://google.com')
    browser.find_by_name('q').first.fill(company_name)
    browser.find_by_name('btnG').first.click()
    browser.find_link_by_text('News').first.click()
    url = browser.evaluate_script("document.URL")
    url = url + "&tbs=qdr:m,sbd:1" + "&num=100&filter=0&start=0"
    browser.visit(url)
    pages = pd.DataFrame()
    df = Google()._results_html_to_df(browser.html)
    pages = pages.append(df)
    if browser.find_by_css('td > a') == []:
        # no pagination links: persist the single page of results and stop
        pages = pages.to_dict('r')
        pages = {'data': pages, 'company_name': company_name, "domain": domain}
        return CompanyExtraInfoCrawl()._persist(pages, "general_news", api_key)
    try:
        _next = browser.find_by_css('td > a')[-1].text
    except:
        _next = None
    if _next:
        while "Next" in _next:
            browser.find_by_css('td > a')[-1].click()
            df = Google()._results_html_to_df(browser.html)
            pages = pages.append(df)
            try:
                _next = browser.find_by_css('td > a')[-1].text
            except:
                break
    pages = pages[pages.link_span.str.contains('(?i){0}'.format(company_name))]
    pages.columns = ['link', 'description', 'title', 'info', '']
    pages['date'] = [i.split('-')[-1] for i in pages['info']]
    pages["timestamp"] = [Helper()._str_to_timestamp(i) for i in pages.date]
    pages['news_source'] = [i.split('-')[0] for i in pages['info']]
    pages = pages.drop_duplicates()
    del pages[""]
    print pages.columns
    pages = pages.to_dict('r')
    pages = {'data': pages, 'company_name': company_name, "domain": domain}
    CompanyExtraInfoCrawl()._persist(pages, "general_news", api_key)
def garble(self, content, time=10):
    print(content)
    print("--------")
    translator = Google()
    self.curr = content
    # round-trip the text through several languages `time` times
    while time > 0:
        print("current time is " + str(time))
        self.curr = translator.translate('zh-CN', 'fr', self.curr)
        self.curr = translator.translate('fr', 'ko', self.curr)
        self.curr = translator.translate('ko', 'zh-CN', self.curr)
        time -= 1
    return self.curr
def _search(self, company_name, api_key=""):
    qry = 'site:linkedin.com inurl:"at-{0}" inurl:title -inurl:job'
    # TODO - remove all [".", "'", ","] from the name as well
    name = company_name.strip().lower().replace(" ", "-")
    dirs = Google().search(qry.format(name), 1)
    for url in dirs.url:
        q.enqueue(LinkedinTitleDir().parse, url, company_name)
def _company_blog(self, domain, api_key="", name=""):
    # TODO - get blog url
    df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    print df
    if df.empty:
        return
    df["count"] = [len(url) for url in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # the shortest link is assumed to be the blog root
    url = df.sort('count').url.ix[0]
    df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
    months = list(calendar.month_abbr)
    timestamps = []
    for _date in df.timestamp:
        try:
            num = months.index(_date.split(" ")[0])
        except:
            timestamps.append(0)
            continue
        _date = str(num) + " " + " ".join(_date.split(" ")[1:])
        try:
            timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
        except:
            if "day" in _date:
                num = int(_date.split(" ")[0])
                timestamps.append(arrow.utcnow().replace(days=num * -1).timestamp)
            else:
                timestamps.append(0)
    df["timestamp"] = timestamps
    data = {'data': df.to_dict('r'), 'blog_url': url}
    data["domain"] = domain
    data["api_key"] = api_key
    data["company_name"] = name
    CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
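# Hypothetical helper (not in the original code) showing the same
# "Mar 3, 2015 ..." / "6 days ago" span parsing as _company_blog above,
# pulled out so it can be exercised on its own. Assumes the old arrow API
# (.timestamp attribute, replace(days=...)) that the rest of this code uses.
def _span_to_timestamp_sketch(span):
    import calendar
    import arrow
    date = span.split("...")[0].strip()
    months = list(calendar.month_abbr)
    try:
        num = months.index(date.split(" ")[0])
    except ValueError:
        if "day" in date:
            # relative dates like "6 days ago"
            return arrow.utcnow().replace(days=-int(date.split(" ")[0])).timestamp
        return 0
    date = str(num) + " " + " ".join(date.split(" ")[1:])
    try:
        return arrow.get(date, "M D, YYYY").timestamp
    except:
        return 0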
def _signal(self, link, api_key=""):
    html = Google().cache(link)
    info = self._html_to_dict(html)
    posts = self._posts(html)
    CompanyInfoCrawl()._persist(info, "facebook", api_key)
    for post in posts:
        CompanyExtraInfoCrawl()._persist(post, "facebook_posts", api_key)
def _html_to_dict(self, url):
    r = BeautifulSoup(Google().cache(url))
    logo = r.find('div', {'class': 'logo'})
    if logo:
        logo = logo.find('img')
        logo = logo['src'] if logo else ""
    else:
        logo = ""
    info = r.find('div', {'id': 'EmpBasicInfo'})
    if info:
        info = info.find_all('div', {'class': 'empInfo'})
    else:
        return {}
    info = dict([[i.find('strong').text.lower().strip(),
                  i.find('span').text.strip()] for i in info])
    info['name'] = r.find('div', {'class': 'header'}).find('h1').text
    info['description'] = r.find('p', {'id': 'EmpDescription'})
    info['description'] = info['description'].text if info['description'] else ""
    info['logo'] = logo
    info['handle'] = url
    return info
def _signal(self, link, api_key=""):
    html = Google().cache(link)
    info = self._html_to_dict(html)
    tweets = self._tweets(html)
    CompanyInfoCrawl()._persist(info, "twitter", api_key)
    for tweet in tweets:
        CompanyExtraInfoCrawl()._persist(tweet, "tweets", api_key)
def _linkedin_profile_from_name(self, company_name):
    qry = company_name + ' site:linkedin.com/company'
    google_results = Google().search(qry)
    if google_results.empty:
        return "not found"
    url = google_results.ix[0].url
    # scrape cache
    return url if "/company/" in url else "not found"
def init(self):
    self.google = Google()
    self.bar = [i for i in u'.' * self.size]
    self.size = float(self.size)
    self.min = float(self.min)
    self.max = float(self.max)
    self.range = self.max - self.min
def main():
    # fetch database credentials from env variables
    db_name = os.environ["ITIME_DB"]
    db_user = os.environ["ITIME_DB_USER"]
    db_password = os.environ["ITIME_DB_PASSWORD"]
    db = Database(db_name, db_user, db_password)
    connection_tries = 0
    # try to establish the db connection, quit if it keeps failing
    while not db.connect():
        print("Trying to reconnect to db, try starting postgres")
        time.sleep(5)
        if connection_tries > 1:
            sys.exit(0)
        connection_tries += 1
    # fetch google client secret file path
    google_api_file = os.environ["ITIME_GOOGLE_API_FILE"]
    google = Google(google_api_file)
    controller = Controller(db, google)
    # config for rabbitmq
    rabbit_server = os.environ["ITIME_RABBIT_SERVER"]
    rabbit_queue = os.environ["ITIME_RABBIT_US_QUEUE"]
    rabbit = AmqpServer(rabbit_server, rabbit_queue, controller.incoming)
    rabbit.start()
    print("Exiting...")
def _press_search(self, domain, api_key):
    pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
    bw = Google().search('"{0}" site:businesswire.com'.format(domain))
    print bw, pw
    # fall back to empty frames so the loops below are safe
    pw = pw if not pw.empty else pd.DataFrame(columns=["link"])
    bw = bw if not bw.empty else pd.DataFrame(columns=["link"])
    queue = "press-check-" + domain
    for link in pw.link:
        job = q.enqueue(PRNewsWire()._email, domain, link, timeout=3600)
        RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
    for link in bw.link:
        job = q.enqueue(BusinessWire()._email, domain, link, timeout=3600)
        RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
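# Sketch of the job-queue pattern used above, assuming `q` is an rq Queue
# (the enqueue/meta/save calls match rq's public API); RQueue()._meta is the
# project's own wrapper and its internals are not shown here.
#
#   from redis import Redis
#   from rq import Queue
#
#   q = Queue(connection=Redis())
#   job = q.enqueue(PRNewsWire()._email, domain, link, timeout=3600)
#   job.meta['profile_id1'] = "{0}_{1}".format(domain, api_key)
#   job.save()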
def _reviews(self, domain, api_key="", name=""):
    df = Google().search('site:glassdoor.com/reviews {0}'.format(name))
    if df.empty:
        return
    url = df.ix[0].link
    r = BeautifulSoup(Crawlera().get(url).text)
    if not r.find("a", {"class": "sortByDate"}):
        return
    # re-fetch the reviews sorted by date through the local html renderer
    url = "http://glassdoor.com" + r.find("a", {"class": "sortByDate"})["href"]
    print url
    r = requests.get("http://localhost:8950/render.html?url={0}".format(url))
    r = BeautifulSoup(r.text)
    rating = r.find('div', {'class': 'ratingNum'})
    rating = rating.text if rating else ""
    # TODO - awards
    reviews = pd.DataFrame()
    for review in r.find_all('li', {'class': 'empReview'}):
        pros = review.find('p', {'class': 'pros'})
        cons = review.find('p', {'class': 'cons'})
        extra = review.find('p', {'class': 'notranslate'})
        summary = review.find('span', {'class': 'summary'})
        date = review.find('time', {'class': 'date'})
        vals = [pros, cons, extra, summary, date]
        cols = ["pros", "cons", "extra", "summary", "date"]
        vals = [val.text.strip() if val else "" for val in vals]
        data = dict(zip(cols, vals))
        data["timestamp"] = Helper()._str_to_timestamp(data["date"])
        reviews = reviews.append(data, ignore_index=True)
    return reviews
def init(self):
    self.colorlib = self.madcow.colorlib
    try:
        self.learn = Learn(madcow=madcow)
    except:
        self.learn = None
    self.google = Google()