def enrich_person(self):
        """Enrich the person from the CrunchBase Algolia search index.

        Posts ``self.enrich_key`` as the search query. When exactly one hit is
        of type 'Person' and has a name equal to the key, its name parts,
        photo, location, social links, current job, primary role and investor
        flag are copied onto this engager via ``set_data`` / ``add_data``.

        Every failure inside the lookup (HTTP error, no hits, no or ambiguous
        name match, malformed payload) is caught at the bottom of the method,
        logged, and NOT propagated; whatever was set before the failure stays.

        Returns:
            list: always ``[P.FULL_NAME]``.
        """
        try:
            # TODO: improve - run 3 searches - by full name, first name and last name. Check all results agains P.possible_names...
            url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
            # NOTE(review): the key is interpolated without URL-encoding; a name
            # containing '&' or '=' would likely corrupt the params - confirm.
            query = 'query=%s&facetFilters=' % self.enrich_key
            payload = {
                "params": query,
                "apiKey": CrunchBaseScraperEngager.THE_KEY,
                "appID": CrunchBaseScraperEngager.APP_ID
            }
            headers = {
                'contentType': 'application/json; charset=utf-8',
                'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID
            }
            response = requests.post(url, json=payload, headers=headers)
            if response.status_code == 429:
                raise EngagementException(
                    "%s. Exceeded requests quota. Error: %s." %
                    (response.status_code, response.text),
                    fatal=True)
            if response.status_code != 200:
                raise EngagementException(
                    "%s. %s." % (response.status_code, response.text),
                    fatal=True)

            # Decode the body once and reuse it for both the count and the hits.
            result = response.json()
            hits = result.get('hits', [])
            # Fall back to the actual number of hits if 'nbHits' is absent.
            if result.get('nbHits', len(hits)) == 0:
                raise EngagementException(
                    "No hits returned when searching for %s." %
                    self.enrich_key)

            # Keep only person hits whose name equals the key exactly.
            matches = [
                hit for hit in hits
                if hit.get('type', '') == 'Person'
                and hit.get('person', False)
                and hit.get('name', '') == self.enrich_key
            ]
            if not matches:
                raise EngagementException(
                    "None of the hits match the person name we're searching for (%s)."
                    % self.enrich_key)
            if len(matches) > 1:
                raise EngagementException(
                    "Person name is ambiguous - got %d hits for %s. Not enriching."
                    % (len(matches), self.enrich_key))

            # Exactly one match is left at this point.
            # TODO: in the future, add the other persons we found to Queue for further enrichment
            person = matches[0]

            # Grab name
            f, m, l = AcureRateUtils.tokenize_full_name(person['name'])
            self.set_data(P.FIRST_NAME, f)
            self.set_data(P.LAST_NAME, l)
            if m:
                self.set_data(P.MIDDLE_NAME, m)

            # Grab person photo
            if 'logo_url' in person:
                self.add_data(P.PHOTOS, {
                    P.PHOTO_URL: person['logo_url'],
                    P.PHOTO_SOURCE: 'crunchbase'
                })

            # Grab location
            if 'location_name' in person:
                self.add_data(P.LOCATIONS, person['location_name'])

            # Grab socials
            if 'permalink' in person:
                self.set_data(P.CB_PERMALINK, person['permalink'])
            if 'url' in person:
                self.set_data(P.CRUNCHBASE_URL, person['url'])
            if 'linkedin_url' in person:
                self.set_data(P.LINKEDIN_URL, person['linkedin_url'])
            if 'twitter_url' in person:
                self.set_data(P.TWITTER_URL, person['twitter_url'])

            # Grab current position (a job is recorded only when both parts
            # are non-empty)
            title = person.get('title')
            company = person.get('organization_name')
            if title and company:
                current_job = {
                    P.JOB_CURRENT: True,
                    P.JOB_TITLE: title,
                    P.JOB_NAME: company
                }
                self.add_data(P.JOBS, current_job)
                if AcureRateUtils.is_business(title):
                    self.logger.info('---->> %s - %s @ %s', person['name'],
                                     title, company)

            # Grab primary role (set whenever both keys are present and not
            # None, even if one of them is an empty string)
            if title is not None and company is not None:
                role = '%s @ %s' % (title, company)
                self.set_data(P.PRIMARY_ROLE, role)

            # Set as business as person was found in CB...
            self.set_data(P.BUSINESS, True)
            self.set_data(P.BUSINESS_REASON, 'appears in CB')

            # Investor? A null count is treated as zero instead of raising
            # TypeError on the comparison.
            n_investments = person.get('n_investments') or 0
            if n_investments > 0:
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON,
                              '%s investments' % n_investments)
                self.logger.info('--==--==-->> Worth looking into %s',
                                 person['name'])
        except Exception as e:
            # NOTE(review): this also swallows the EngagementExceptions raised
            # above, including the ones flagged fatal=True - confirm that
            # log-and-continue is the intended contract for callers.
            self.logger.error(
                'Failed to set some properties on person %s. Returning partial. (exception: %s)',
                self.enriched_entity, e)
        return [P.FULL_NAME]
Ejemplo n.º 2
0
 def _get_person(self, full_name):
     """Fetch a stored person matching the first and last name of `full_name`.

     The name is split with ``AcureRateUtils.tokenize_full_name``; the middle
     part is ignored. The lookup filters on the ``deduced.first_name`` and
     ``deduced.last_name`` fields.

     Returns:
         An ``AcureRatePerson`` reconstructed from the lookup result, or
         ``None`` when the lookup result is falsy.
     """
     first, _middle, last = AcureRateUtils.tokenize_full_name(full_name)
     query = {"deduced.first_name": first, "deduced.last_name": last}
     # NOTE(review): the second argument presumably requests a single
     # record - confirm against DBWrapper.get_persons.
     record = DBWrapper.get_persons(query, True)
     if not record:
         return None
     return AcureRatePerson().reconstruct(record)
Ejemplo n.º 3
0
    def enrich_person(self):
        """Scrape a CrunchBase person profile page with Selenium.

        ``self.enrich_key`` is used as the CrunchBase permalink. The page is
        opened in Chrome and the person's name, primary role, photo, date of
        birth, gender, location, website, social links, description, current
        and past jobs, advisory roles, personal investments and education are
        read and stored via ``set_data`` / ``add_data``.

        Only the name is mandatory. Every other section is best-effort: if its
        elements cannot be found or parsed, that section is skipped.

        Returns:
            list: ``[P.FULL_NAME]``.

        Raises:
            EngagementException: if the profile heading (the person's name)
                cannot be located or tokenized.
        """
        permalink = self.enrich_key
        url = 'https://www.crunchbase.com/person/%s#/entity' % permalink

        driver = webdriver.Chrome(r'C:\Python353\browser_drivers\chromedriver')
        try:
            driver.implicitly_wait(11)  # seconds

            # Activate the driver
            driver.get(url)

            # If we got to here, keep the permalink and URL
            self.set_data(P.CB_PERMALINK, permalink)
            self.set_data(P.CRUNCHBASE_URL, url)

            # Get person name - the only mandatory section
            try:
                full_name = driver.find_element_by_id('profile_header_heading').text
                f, m, l = AcureRateUtils.tokenize_full_name(full_name)
                self.set_data(P.FIRST_NAME, f)
                self.set_data(P.LAST_NAME, l)
                if m:
                    self.set_data(P.MIDDLE_NAME, m)
                # The page is loaded; shorten the wait so the optional
                # lookups below fail fast when an element is absent.
                driver.implicitly_wait(2)  # seconds
            except Exception:
                s = "Failed to enrich %s. Unable to locate name entity in page - %s - something went awry... dumping this crawl." % (permalink, url)
                raise EngagementException(s)

            # Get primary role
            try:
                content = driver.find_element_by_xpath('//dt[text()="Primary Role"]')
                role_str = content.find_element_by_xpath("following-sibling::*[1]").text
                self.set_data(P.PRIMARY_ROLE, role_str.replace('\n', ' '))
            except Exception:
                pass  # optional section

            # Get photo (skip CrunchBase's default placeholder image)
            try:
                content = driver.find_element_by_class_name('logo-links-container')
                photo_url = content.find_element_by_css_selector("div > img").get_attribute("src")
                if "cb-default" not in photo_url:
                    self.add_data(P.PHOTOS, {P.PHOTO_URL: photo_url, P.PHOTO_SOURCE: 'crunchbase'})
            except Exception:
                pass  # optional section

            # Get dob
            try:
                content = driver.find_element_by_xpath('//dt[text()="Born:"]')
                dob = content.find_element_by_xpath("following-sibling::*[1]").text
                self.set_data(P.DOB, dob)
            except Exception:
                pass  # optional section

            # Get gender
            try:
                content = driver.find_element_by_xpath('//dt[text()="Gender:"]')
                gender = content.find_element_by_xpath("following-sibling::*[1]").text
                self.add_data(P.GENDER, gender)
            except Exception:
                pass  # optional section

            # Get location
            try:
                content = driver.find_element_by_xpath('//dt[text()="Location:"]')
                location = content.find_element_by_xpath("following-sibling::*[1]").text
                if location != "Unknown":
                    self.add_data(P.LOCATIONS, location)
            except Exception:
                pass  # optional section

            # Get web-site
            try:
                content = driver.find_element_by_xpath('//dt[text()="Website:"]').find_element_by_xpath("following-sibling::*[1]")
                website_url = content.find_element_by_css_selector('a').get_attribute("href")
                self.set_data(P.WEBSITE, website_url)
            except Exception:
                pass  # optional section

            # Get socials
            try:
                content = driver.find_element_by_xpath('//dt[text()="Social: "]').find_element_by_xpath("following-sibling::*[1]")
                social_links_elems = content.find_elements_by_tag_name('a')
                for e in social_links_elems:
                    social_type = e.get_attribute('data-icons')  # "facebook", "twitter", "linkedin", etc.
                    social_link = e.get_attribute('href')
                    if social_type == 'facebook':
                        self.set_data(P.FACEBOOK_URL, social_link)
                    elif social_type == 'twitter':
                        self.set_data(P.TWITTER_URL, social_link)
                    elif social_type == 'linkedin':
                        self.set_data(P.LINKEDIN_URL, social_link)
            except Exception as e:
                print(e)

            # Get person details (description)
            try:
                person_details_elem = driver.find_element_by_id('description')
                person_details_str = person_details_elem.text
                self.set_data(P.DESCRIPTION, person_details_str)
            except Exception as e:
                print(e)

            # Get current jobs
            try:
                for row in driver.find_elements_by_css_selector(".experiences .current_job"):
                    title = row.find_element_by_tag_name('h4').text
                    company = row.find_element_by_tag_name('h5').text
                    current_job = {P.JOB_CURRENT: True, P.JOB_TITLE: title, P.JOB_NAME: company}
                    self.add_data(P.JOBS, current_job)
            except Exception as e:
                print(e)

            # Get past jobs (the first and last '.info-row' are skipped)
            try:
                past_job_section = driver.find_element_by_css_selector(".experiences .past_job")
                for row in past_job_section.find_elements_by_css_selector(".info-row")[1:-1]:
                    cols = row.find_elements_by_css_selector(".cell")
                    started = cols[0].text
                    ended = cols[1].text
                    title = cols[2].text
                    company = cols[3].text
                    past_job = {P.JOB_STARTED: started, P.JOB_ENDED: ended, P.JOB_TITLE: title, P.JOB_NAME: company}
                    self.add_data(P.JOBS, past_job)
            except Exception as e:
                print(e)

            # Get advisory roles
            try:
                advisory_roles_section = driver.find_element_by_css_selector(".advisory_roles")
                for row in advisory_roles_section.find_elements_by_css_selector("li .info-block"):
                    company = row.find_element_by_tag_name('h4').text
                    role_started = row.find_elements_by_css_selector('h5')
                    role = role_started[0].text
                    started = role_started[1].text
                    advisory_job = {P.JOB_TITLE: role, P.JOB_NAME: company}
                    if started.strip() != '':
                        advisory_job[P.JOB_STARTED] = started
                    self.add_data(P.ADVISORY_JOBS, advisory_job)
            except Exception as e:
                print(e)

            # Get investments
            try:
                investments = []
                investors_tables = driver.find_elements_by_css_selector(".table.investors")
                if len(investors_tables) > 0:
                    investors_rows_elements = investors_tables[0].find_elements_by_tag_name("tr")
                    for investor_element in investors_rows_elements[1:]:  # we're skipping the header line
                        txt = investor_element.text
                        # Cheap text pre-filter before fetching the cells; only
                        # rows whose 4th column is exactly 'Personal Investment'
                        # are kept.
                        if 'personal investment' in txt.lower():
                            cols = investor_element.find_elements_by_tag_name('td')
                            if cols[3].text == 'Personal Investment':
                                investments.append((cols[0].text, cols[1].text, cols[2].text))
                    self.set_data(P.INVESTMENTS, investments)
            except Exception as e:
                print(e)

            # Get education
            try:
                content = driver.find_element_by_class_name('education')
                education_elements = content.find_elements_by_css_selector("li > div")
                for elem in education_elements:
                    # Fresh dict per entry: a single shared dict would make
                    # every stored education alias the same object and carry
                    # stale keys over from the previous entry.
                    ed = {}
                    institute_name = elem.find_element_by_css_selector('h4 > a').text
                    if institute_name != '':
                        ed[P.EDUCATION_INSTITUTE] = institute_name
                    degree = elem.find_element_by_css_selector('h5').text
                    if degree != '':
                        ed[P.EDUCATION_DEGREE] = degree
                    # Whatever text remains after removing the institute and
                    # degree is stored as the years.
                    years = elem.text.replace(institute_name, '').replace(degree, '').strip()
                    if years != '':
                        ed[P.EDUCATION_YEARS] = years
                    self.add_data(P.EDUCATIONS, ed)
            except Exception:
                pass  # optional section
        finally:
            # Always end the browser session, on failure too, so the browser
            # and driver are not left running.
            driver.quit()

        return [P.FULL_NAME]