Code Example #1
 def _reviews(self, domain, api_key="", name=""):
     df = Google().search('site:glassdoor.com/reviews {0}'.format(name))
     if df.empty: return
     url = df.iloc[0]["link"]  # take the first Google result; .ix is deprecated
     r = BeautifulSoup(Crawlera().get(url).text)
     if not r.find("a", {"class": "sortByDate"}): return
     url = "http://glassdoor.com" + r.find("a",
                                           {"class": "sortByDate"})["href"]
     print url
     r = requests.get(
         "http://localhost:8950/render.html?url={0}".format(url))
     r = BeautifulSoup(r.text)
     rating = r.find('div', {'class': 'ratingNum'})
     rating = rating.text if rating else ""
     # TODO - awards
     reviews = []  # collect rows in a list; DataFrame.append is deprecated
     for review in r.find_all('li', {'class': 'empReview'}):
         pros = review.find('p', {'class': 'pros'})
         cons = review.find('p', {'class': 'cons'})
         extra = review.find('p', {'class': 'notranslate'})
         summary = review.find('span', {'class': 'summary'})
         date = review.find('time', {'class': 'date'})
         vals = [pros, cons, extra, summary, date]
         cols = ["pros", "cons", "extra", "summary", "date"]
         vals = [val.text.strip() if val else "" for val in vals]
         data = dict(zip(cols, vals))
         data["timestamp"] = Helper()._str_to_timestamp(data["date"])
         reviews.append(data)
     return pd.DataFrame(reviews)
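The snippet above leans on a Helper()._str_to_timestamp utility that is not shown. A minimal sketch of what such a conversion might look like, assuming the scraped dates come back as strings like "Jun 2, 2015" (the format string and the None fallback are assumptions, not the original helper):

    from datetime import datetime
    import time

    def _str_to_timestamp(date_str, fmt="%b %d, %Y"):
        # Parse a scraped date string and return a Unix timestamp, or None if it does not parse
        try:
            return time.mktime(datetime.strptime(date_str.strip(), fmt).timetuple())
        except ValueError:
            return None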
Code Example #2
 def _html_to_dict(self, url):
     r = Crawlera().get(url).text
     bs = BeautifulSoup(r)  # parse the page once and reuse the soup
     try:
         name = bs.find('h1', {'id': 'company_name'}).text
     except AttributeError:
         return {}
     desc = bs.find('span', {'id': 'desc_short'})
     desc = desc.text if desc else ""
     data = {'name': name, 'description': desc}
     content = bs.find_all('td', {'class': 'metadata_content'})
     links = []
     for c in content:
         links = links + c.find_all('a')
     for i in links:
         if "website" in i.text:
             print i['href']
             website = urllib.unquote(i['href']).split('=')[1]
             website = website.split('?')[0].split('&')[0]
             domain = "{}.{}".format(
                 tldextract.extract(website).domain,
                 tldextract.extract(website).tld)
             data["website"] = website
             data["domain"] = domain
     return data
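The two-call tldextract pattern above reappears in several later examples and could live in one helper. Note that the .tld attribute only exists on older tldextract releases (newer versions renamed it .suffix), so a version-tolerant sketch might look like this (the _to_domain name is made up):

    import tldextract

    def _to_domain(url):
        # Reduce "http://blog.example.co.uk/about" to "example.co.uk"
        ext = tldextract.extract(url)
        suffix = getattr(ext, "suffix", "") or getattr(ext, "tld", "")
        return "{}.{}".format(ext.domain, suffix)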
Code Example #3
    def _search(self, qry, page, location='', country=None):
        '''
        Input  : job title qry, page number, optional location and country
        Output : list of raw HTML strings from the Indeed result pages
        '''
        #print page, qry, location
        qry = {
            'q': '{0}'.format(qry),
            'sort': 'date',
            'start': page * 50,
            'limit': 50,
            'l': location
        }

        print "INDEED LOCALE", location, country
        if country == "Canada":
            urls = ["http://ca.indeed.com/jobs?" + urllib.urlencode(qry)]
        elif country == "USA":
            urls = ["http://www.indeed.com/jobs?" + urllib.urlencode(qry)]
        else:
            canadian_url = [
                "http://ca.indeed.com/jobs?" + urllib.urlencode(qry)
            ]
            american_url = [
                "http://www.indeed.com/jobs?" + urllib.urlencode(qry)
            ]
            urls = canadian_url + american_url

        pages = [Crawlera().get(url).text for url in urls]
        return pages
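A hypothetical caller for the search above, paging through the first few result pages and parsing each one (the Indeed class name is an assumption; only _search is shown here):

    pages = []
    for page in range(3):
        pages += Indeed()._search("inside sales", page, location="Toronto", country="Canada")
    soups = [BeautifulSoup(page_html) for page_html in pages]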
Code Example #4
    def _html_to_dict(self, url):
        #r = requests.get(url).text
        r = Crawlera().get(url).text
        print url
        bs = BeautifulSoup(r)  # parse the page once and reuse the soup below
        try:
            company_name = bs.find('h1', {'itemprop': 'name'}).find('strong').text
        except AttributeError:
            return {"handle": url}
        address = bs.find('h1', {'itemprop': 'name'}).find('span').text
        city = bs.find('span', {'itemprop': 'addressLocality'}).text
        state = bs.find('span', {'itemprop': 'addressRegion'}).text
        postal_code = bs.find('span', {'itemprop': 'postalCode'}).text
        description = bs.find('article', {'itemprop': 'description'}).text
        description = description.strip().replace('\nMore...', '')
        logo = bs.find('figure').find('img')['src']
        website = bs.find('li', {'class': 'website'}).find('a')['href'].split('gourl?')[-1]
        domain = "{}.{}".format(
            tldextract.extract(website).domain,
            tldextract.extract(website).tld)
        ''' Phone '''
        main = bs.find('li', {'class': 'phone'}).find('strong', {'class': 'primary'}).text
        numbers = bs.find('li', {'class': 'phone'}).findAll('li')
        nums = [number.find('span').text for number in numbers]
        names = [number.text.split(number.find('span').text)[0]
                 for number in numbers]
        numbers = dict(zip(names, nums))
        numbers['main'] = main

        _vars = [company_name, address, city, state, postal_code, description,
                 logo, website, domain]
        labels = ["name", "address", "city", "state", "postal_code",
                  "description", "logo", "website", "domain"]
        company = dict(zip(labels, _vars))
        company["numbers"] = numbers
        company["handle"] = url
        return company
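The phone block above boils down to zipping label strings with number strings; a tiny illustration with made-up values:

    names = ["Fax ", "Toll Free "]
    nums = ["555-0199", "555-0111"]
    numbers = dict(zip(names, nums))
    numbers["main"] = "555-0100"
    # {'Fax ': '555-0199', 'Toll Free ': '555-0111', 'main': '555-0100'}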
Code Example #5
 def _html_to_dict(self, url):
     bs = BeautifulSoup(Crawlera().get(url).text)
     info = bs.find('div', {'class': 'ataglanz'})
     if info:
         info = info.text.split('\n')
     else:
         return {}
     info = dict([i.strip().split(': ', 1) for i in info if ": " in i])
     logo = bs.find('div', {'class': 'profileLeft'}).find('img')['src']
     info['logo'] = logo
     info['description'] = bs.find('p', {'id': 'bio'}).text
     info['name'] = bs.find('hgroup').text
     info['handle'] = url
     return info
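The 'ataglanz' parsing above turns each "Key: Value" line into a dict entry; a standalone illustration with made-up input:

    lines = ["Industry: Software", "Employees: 250", "Founded: 1999", "no colon here"]
    parsed = dict(line.strip().split(': ', 1) for line in lines if ": " in line)
    # {'Industry': 'Software', 'Employees': '250', 'Founded': '1999'}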
Code Example #6
 def _html(self, qry, page=1, location="", country=None):
     #qry, page = "inside sales", 1
     if country:
         location = location + " " + country
     qry = {
         'search': '{0}'.format(qry),
         'page': page,
         'location': location,
         'days': 1
     }
     _url = "https://jobs.ziprecruiter.com/candidate/search?{0}"
     _url = _url.format(urllib.urlencode(qry))
     print _url
     return BeautifulSoup(Crawlera().get(_url).text)
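For reference, the search URL built above comes out roughly like this for a sample query (parameter order depends on the dict, and urllib.urlencode turns spaces into '+'):

    import urllib
    qry = {'search': 'inside sales', 'page': 1, 'location': 'Toronto Canada', 'days': 1}
    url = "https://jobs.ziprecruiter.com/candidate/search?{0}".format(urllib.urlencode(qry))
    # e.g. https://jobs.ziprecruiter.com/candidate/search?search=inside+sales&page=1&location=Toronto+Canada&days=1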
Code Example #7
 def _parse_article_html(self, objectId, url, industry_press=None):
     #html = requests.get(url).text
     html = Crawlera().get(url).text
     article = BeautifulSoup(html).find("div", {
         "id": "ReleaseContent"
     }).text
     #
     ps = [
         p.text.split("SOURCE ")[-1]
         for p in BeautifulSoup(html).find_all("p") if "SOURCE " in p.text
     ]
     company_name = ps[0] if ps else ""
     #q.enqueue(ClearSpark()._bulk_company_info, company_name)
     data = {"article": article, "company_name": company_name}
     if industry_press:
         r = Parse().update("IndustryPress", objectId, data)
     else:
         r = Parse().update("Press", objectId, data)
     print r.json()
Code Example #8
    def _html_to_dict(self, url):
        co = BeautifulSoup(Crawlera().get(url).text)
        name = co.find('span', {'itemprop': 'name'})
        description = co.find('p', {'itemprop': 'description'})
        address = co.find('div', {'itemprop': 'address'})
        phone = co.find('div', {'itemprop': 'telephone'})
        website = ""  #co.find('div',{'id':'detailsContainer'}).find('a')
        # TODO - figure out why this is not working

        _vars = [name, description, address, phone, website]
        _vars = [var.text.strip() if var else "" for var in _vars]
        labels = ["name", "description", "address", "phone", "website"]
        print website
        data = dict(zip(labels, _vars))
        if data["website"] != "":
            data['domain'] = "{}.{}".format(
                tldextract.extract(data["website"]).domain,
                tldextract.extract(data["website"]).tld)
        data['handle'] = url
        return data
Code Example #9
    def _html_to_dict(self, url):
        r = Crawlera().get(url).text
        bs = BeautifulSoup(r)
        company_name = bs.find('h1', {'class': 'biz-page-title'})
        industry = bs.find('span', {'class': 'category-str-list'})
        address = bs.find('address', {'itemprop': 'address'})
        phone = bs.find('span', {'itemprop': 'telephone'})
        website = bs.find('div', {'class': 'biz-website'})
        website = website.find('a') if website else None

        _vars = [company_name, industry, address, phone, website]
        _vars = [var.text.strip() if var else "" for var in _vars]
        labels = ["name", "industry", "address", "phone", "website"]
        data = dict(zip(labels, _vars))
        data["industry"] = [data["industry"]]
        print data
        if data["website"] != "":
            tld = tldextract.extract(self._remove_non_ascii(data["website"]))
            data['domain'] = "{}.{}".format(tld.domain, tld.tld)
        data["handle"] = url
        return data
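_remove_non_ascii is called above but not defined in this excerpt; a plausible sketch of such a helper (an assumption, not the original implementation):

    def _remove_non_ascii(self, text):
        # Drop non-ASCII characters so tldextract gets a clean URL string
        return "".join(ch for ch in text if ord(ch) < 128)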
Code Example #10
 def _html_to_dict(self, _url):
     url = _url
     bs = BeautifulSoup(Crawlera().get(url).text)
     name = bs.find('h1', {'itemprop': 'name'})
     name = name.text.split('Company ')[0] if name else ""
     telephone = bs.find('span', {'itemprop': 'telephone'})
     telephone = telephone.text if telephone else ""
     try:
         address = bs.find('p', {
             'itemprop': 'address'
         }).text.split(telephone)[0].strip()
     except:
         address = ""
     url = bs.find('p', {'itemprop': 'address'})
     url = url.find('a') if url else ""
     url = url.text if url else ""
     cols = ["name", "phone", "address", "website"]
     vals = [name, telephone, address, url]
     info = dict(zip(cols, vals))
     info['handle'] = _url
     if "website" in info.keys():
         tld = tldextract.extract(info["website"])
         info['domain'] = "{}.{}".format(tld.domain, tld.tld)
     return info
Code Example #11
 def _search(self, qry):
     #html = Google().cache("https://twitter.com/guidespark")
     qry = qry.replace(" ", "%20")
     url = "https://twitter.com/search?f=realtime&q={0}&src=typd"
     html = Crawlera()._get(url.format(qry)).text
     tw = BeautifulSoup(html)
     tweets = []
     for tweet in tw.find_all("div", {"class": "tweet"}):
         text = tweet.find("p", {"class": "tweet-text"})
         if text:
             text = text.text
         else:
             continue
         hashtags = [
             hashtag["href"] for hashtag in tweet.find_all(
                 "a", {"class": "twitter-hashtag"})
         ]
         mentions = [
             "twitter.com" + reply["href"]
             for reply in tweet.find_all("a", {"class": "twitter-atreply"})
         ]
         links = [
             link["href"] for link in tweet.find_all(
                 "a", {"class": "twitter-timeline-link"})
         ]
         photos = [
             img["src"] for img in tweet.find_all(
                 "img", {"class": "TwitterPhoto-mediaSource"})
         ]
         tweet = {
             "text": text,
             "hashtags": hashtags,
             "mentions": mentions,
             "links": links,
             "photos": photos,
             "name": tweet.find("strong", {"class": "fullname"}).text,
             "handle": tweet.find("span", {"class": "username"}).text,
             "profile_pic": tweet.find("img", {"class": "avatar"})["src"],
             "timestamp": tweet.find("span", {"class": "_timestamp"})["data-time"],
             "time_ago": tweet.find("span", {"class": "_timestamp"}).text
         }
         tweets.append(tweet)
         #CompanyExtraInfoCrawl()._persist(tweet, "tweets")
     tweets = pd.DataFrame(tweets)
     Parse()._batch_df_create("Tweet", tweets)
     # TODO - find company_name + title from twitter
     return tweets
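A hypothetical caller for the Twitter search above (the Twitter class name is assumed; _search both persists the tweets to Parse and returns them as a DataFrame):

    tweets = Twitter()._search("GuideSpark")
    recent = tweets[["handle", "text", "timestamp", "links"]]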