コード例 #1
0
    def enrich_person__truecaller(self):
        """Reset the Truecaller search throttle, then request one number page.

        Both calls run with ``requests_cache`` disabled and send the same
        hard-coded session cookie. Nothing from the responses is parsed or
        stored, and the method returns ``None``.

        Raises:
            EngagementException: if either call answers with a status other
                than 200.
        """
        # NOTE(review): the session cookie and the looked-up phone number are
        # hard-coded in source - confirm whether they should come from config.
        session_cookie = '__cfduid=d2554926f77b4da885e17e2e453d5e4c71481629449; tcToken=eyJpdiI6ImdLOFwvcFUxWWc0NnZHVnA4WkNRYVZLXC9PNjRMUGtmeXUzTldMSzZNZ1RNYz0iLCJ2YWx1ZSI6Ilpvb21DWVVFN0ptdUNKTFl4QnhiRjdKMVB0WTFrTHliRk13dFwvZHFzTklrMXczdnZPeEgzU2RXdnliNVg0bGxISG1IZU15dlRmaDNcL2xIclJoSCtwZUE9PSIsIm1hYyI6IjljOGJmMWQ4NzJmMDIzMDNjMjEwM2U2MzUwY2IxZDcyZWJlZjJhNWUzNGM3NmNhNDEwYzc3MThkMGM5Nzc3NGIifQ%3D%3D; _gat=1; _ga=GA1.2.979500326.1481629451; tcSession=eyJpdiI6IlZ6U2hsaGd1VDRPRHF1V1dwNDFHNnpVbkN6OEpFMk5LeElleWJrdDZuSms9IiwidmFsdWUiOiJzXC9TWldPYXV5WXBlWTk5dEdEZmxlQWwwR29OUThhc2k0eFBqMmF0VFJjVTVtNVhKYmZLR3hHOFdjMFJzYVVuOVlBSERpbVNnK0RNcEk4ME9IV1dnVnc9PSIsIm1hYyI6IjE1OWIzODM0MTFkOWZmOWRlNWVkNDQ1ZjhjYWIwOTEwYmQwNzc2MDIyZmQwMTY0NjM1MTdhNjlhMzgxNjdhNWEifQ%3D%3D; XLBS3=XLBS1|WE/k1|WE/fD'

        request_headers = {
            'contentType': 'application/json; charset=utf-8',
            'Cookie': session_cookie,
        }

        # First reset the search throttle, then fetch the number page.
        calls = (
            (requests.post, 'https://www.truecaller.com/throttle/reset/throttleSearch'),
            (requests.get, 'https://www.truecaller.com/il/0504333102'),
        )
        for send, target in calls:
            with requests_cache.disabled():
                reply = send(target, headers=request_headers)
            if reply.status_code != 200:
                raise EngagementException("%s. %s." %
                                          (reply.status_code, reply.text))
コード例 #2
0
    def _get_company_info(self, domain):
        """POST the entity's deduced domain to the CircleBack company service.

        Args:
            domain: Overwritten before use with the ``'domain'`` entry of
                ``self.enriched_entity.deduced``, so the passed value has no
                effect. NOTE(review): confirm whether callers expect their
                argument to be honoured.

        Returns:
            The decoded JSON body of the response.

        Raises:
            EngagementException: always flagged fatal - when the deduced
                domain is missing, when the service answers with a status
                other than 200, or when any other exception occurs while
                building/sending the request.
        """
        try:
            endpoint = '%s/%s' % (CircleBackEngager.BASE_URL, CircleBackEngager.COMPANY_SERVICE)
            deduced = self.enriched_entity.deduced
            company_name = deduced.get('name', '<No-Company-Name>')
            domain = deduced.get('domain', None)
            if domain is None:
                raise EngagementException("Domain property of company %s not found." % company_name, fatal=True)

            response = requests.post(
                endpoint,
                json={'domains': [domain]},
                headers={'contentType': 'application/json; charset=utf-8', 'X-CB-ApiKey': CircleBackEngager.THE_KEY},
            )

            # Translate any non-200 status into a single fatal error.
            status = response.status_code
            failure = None
            if status == 429:
                failure = "%s. Exceeded requests quota. Error: %s." % (status, response.text)
            elif status >= 500:
                failure = "Server Error (%d). Error: %s." % (status, response.reason)
            elif status != 200:
                failure = "%s. %s." % (status, response.text)
            if failure is not None:
                raise EngagementException(failure, fatal=True)

            # The attribute is only present on some responses; record it when it is.
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
        except EngagementException:
            raise
        except Exception as err:
            raise EngagementException(err, True)

        return response.json()
コード例 #3
0
 def set_enrich_key(self):
     """Derive ``self.enrich_key`` from the enriched entity.

     For an ``AcureRatePerson`` the key is "<email> <first> <last>" when all
     three are available, or the email alone otherwise. For an
     ``AcureRateCompany`` the key is the deduced domain.

     Raises:
         EngagementException: when a person has no usable properties, a
             company has no domain, or the entity type is not recognised.
     """
     entity = self.enriched_entity
     kind = entity.__class__.__name__

     if kind == 'AcureRatePerson':
         email = self.get_pivot_email()
         phone = None  # ToDo: no phone pivot yet, so the phone branch never fires
         first = entity.deduced.get(P.FIRST_NAME, None)
         last = entity.deduced.get(P.LAST_NAME, None)
         named = first and last

         if email:
             key = "%s %s %s" % (email, first, last) if named else email
         elif phone and named:
             key = "%s %s %s" % (phone, first, last)
         else:
             raise EngagementException(
                 "CircleBack - cannot engage. No properties avaialable to set enrich key"
             )
         self.enrich_key = key
         return

     if kind == 'AcureRateCompany':
         if C.DOMAIN not in entity.deduced:
             raise EngagementException(
                 "CircleBack - cannot engage - no domain property available as enrich key"
             )
         self.enrich_key = entity.deduced.get(C.DOMAIN)
         return

     raise EngagementException(
         "CircleBack - cannot engage - cannot generate enrich key. Unknown entity type"
     )
コード例 #4
0
ファイル: pipl_engager.py プロジェクト: drorgarti/SatoriLab
 def _handle_pipl_api_errors(self, response):
     if response.status_code == 200:  # All is ok.
         return
     # Handle different errors. Documentation - https://www.fullcontact.com/developer/docs/
     if response.status_code == 403:
         raise EngagementException("403. Quota Exceeded!", True)
     elif response.status_code == 400:
         raise EngagementException("400. Bad request", True)
     elif response.status_code == 500:
         raise EngagementException("500. Server Error", True)
     else:
         raise EngagementException("%s. Pipl engage error: %s" %
                                   (response.status_code, response.text))
コード例 #5
0
    def _get_person_info(self):
        """Search the WhitePages find-person service by first and last name.

        The address filter (Melville, NY, US) is hard-coded in the query.

        Returns:
            The decoded JSON body of the response, or ``None`` when the
            entity has no first or last name to search with.

        Raises:
            EngagementException: always flagged fatal - for 403, 429, 5xx
                and any other non-200 status, and for any other exception
                raised while building or sending the request.
        """
        # Function-scope import so the block does not rely on file-level imports.
        from urllib.parse import quote

        try:
            url = '%s/%s' % (WhitePagesEngager.BASE_URL,
                             WhitePagesEngager.FIND_PERSON_SERVICE)
            fname = self.enriched_entity.deduced.get('first_name', None)
            lname = self.enriched_entity.deduced.get('last_name', None)
            if not (fname and lname):
                return None

            # Percent-encode the name parts: a raw '&', '#', '=' or space inside
            # a name would otherwise corrupt or truncate the query string.
            parametrized_url = (
                '%s?api_key=%s&name=%s%%20%s'
                '&address.city=Melville&address.state_code=NY&address.country_code=US'
                % (url, WhitePagesEngager.THE_KEY,
                   quote(str(fname), safe=''), quote(str(lname), safe='')))

            response = requests.get(parametrized_url)
            status = response.status_code
            if status == 403:
                raise EngagementException(
                    "%s. Forbidden. Error: %s." % (status, response.text),
                    fatal=True)
            if status == 429:
                raise EngagementException(
                    "%s. Exceeded requests quota. Error: %s." %
                    (status, response.text),
                    fatal=True)
            if status >= 500:
                raise EngagementException(
                    "Server Error (%d). Error: %s." %
                    (status, response.reason),
                    fatal=True)
            if status != 200:
                raise EngagementException(
                    "%s. %s." % (status, response.text),
                    fatal=True)

            # The attribute is only present on some responses; record it when it is.
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
        except EngagementException:
            raise
        except Exception as e:
            raise EngagementException(e, True)

        return response.json()
コード例 #6
0
    def enrich_company(self):
        """Validate the company lookup result for the current enrich key.

        Calls ``self._get_company_info()`` and checks that the result is
        neither pending nor missing its ``'company'`` entry.

        Returns:
            ``[C.NAME]``.

        Raises:
            EngagementException: when the result is flagged ``'pending'`` or
                carries no ``'company'`` entry.
        """
        result_obj = self._get_company_info()

        if 'pending' in result_obj and result_obj['pending']:
            # This method enriches a company, so the message names a company.
            msg = 'Failed to get information on company %s. Pending (202)' % self.enrich_key
            raise EngagementException(msg)

        if 'company' not in result_obj or result_obj['company'] is None:
            msg = 'Failed to get information on company %s. Not Found (404)' % self.enrich_key
            raise EngagementException(msg)

        # NOTE(review): nothing from result_obj['company'] is copied onto the
        # entity yet - confirm whether property extraction is still pending.
        return [C.NAME]
コード例 #7
0
 def set_enrich_key(self):
     """Use the entity's pivot phone as ``self.enrich_key``.

     Raises:
         EngagementException: when no pivot phone is available.
     """
     phone = self.get_pivot_phone()
     if phone:
         self.enrich_key = "%s" % phone
     else:
         # Format the message here: a second positional argument to
         # EngagementException is its fatal flag, not a format argument.
         raise EngagementException(
             "OpenCnam Engager - cannot engage. Cannot create enrich key for %s"
             % (self.enriched_entity,))
コード例 #8
0
 def _get_entity_by_type(self, entity_type):
     """Create an empty entity for the given type name.

     Args:
         entity_type: ``'people'`` or ``'company'``.

     Returns:
         A new ``AcureRatePerson`` or ``AcureRateCompany``.

     Raises:
         EngagementException: for any other type name.
     """
     if entity_type == 'people':
         return AcureRatePerson()
     if entity_type == 'company':
         return AcureRateCompany()
     # Format the message here: a second positional argument to
     # EngagementException is its fatal flag, not a format argument.
     raise EngagementException('Unknown entity type - %s' % (entity_type,))
コード例 #9
0
 def set_enrich_key(self):
     """Derive ``self.enrich_key`` from the enriched entity.

     A person is keyed by the pivot email, a company by its deduced domain.

     Raises:
         EngagementException: when the person has no email (``None``), the
             company has no domain property, or the entity type is not
             recognised.
     """
     entity = self.enriched_entity
     kind = entity.__class__.__name__

     if kind == 'AcureRatePerson':
         pivot_email = self.get_pivot_email()
         if pivot_email is None:
             raise EngagementException(
                 "FullContacts - cannot engage. No email available as enrich key"
             )
         self.enrich_key = pivot_email
         return

     if kind == 'AcureRateCompany':
         if C.DOMAIN not in entity.deduced:
             raise EngagementException(
                 "FullContacts - cannot engage - no domain property to use as key"
             )
         self.enrich_key = entity.deduced.get(C.DOMAIN)
         return

     raise EngagementException(
         "FullContacts - cannot engage - cannot generate enrich key. Unknown entity type"
     )
コード例 #10
0
    def _get_person_info(self):
        """Return the result of ``clearbit.Enrichment.find`` for the enrich key.

        The enrich key is passed as the ``email`` argument.

        Raises:
            EngagementException: an EngagementException from the lookup is
                re-raised as is; any other exception is wrapped with
                ``True`` as the second argument.
        """
        try:
            return clearbit.Enrichment.find(email=self.enrich_key)
        except EngagementException:
            raise
        except Exception as err:
            raise EngagementException(err, True)
コード例 #11
0
    def _get_person_info(self):
        """POST a single match request to the CircleBack people service.

        The match request carries the pivot email, plus first and last name
        when both are known. A phone-based request is coded but never sent,
        because no phone value is read yet.

        Returns:
            The decoded JSON body of the response, or ``None`` when there is
            nothing to match on.

        Raises:
            EngagementException: always flagged fatal - for 429, 5xx and any
                other non-200 status, and for any other exception raised
                while building or sending the request.
        """
        try:
            endpoint = '%s/%s' % (CircleBackEngager.BASE_URL, CircleBackEngager.PEOPLE_SERVICE)
            email = self.get_pivot_email()
            phone = None  # TODO: implement
            fname = self.enriched_entity.deduced.get('first_name', None)
            lname = self.enriched_entity.deduced.get('last_name', None)
            req_id = getattr(self.enriched_entity, 'aid', 'no-attr')

            # Build the match request from the strongest identifiers available.
            named = fname and lname
            if email:
                match_request = {'request_id': req_id, 'email': email}
                if named:
                    match_request['first_name'] = fname
                    match_request['last_name'] = lname
            elif phone and named:
                match_request = {'request_id': req_id, 'phone_number': phone, 'first_name': fname, 'last_name': lname}
            else:
                return None

            # TODO: create more than one request (one with email, one with phone)
            response = requests.post(
                endpoint,
                json={'match_requests': [match_request]},
                headers={'contentType': 'application/json; charset=utf-8',
                         'X-CB-ApiKey': CircleBackEngager.THE_KEY},
            )

            # Translate any non-200 status into a single fatal error.
            status = response.status_code
            failure = None
            if status == 429:
                failure = "%s. Exceeded requests quota. Error: %s." % (status, response.text)
            elif status >= 500:
                failure = "Server Error (%d). Error: %s." % (status, response.reason)
            elif status != 200:
                failure = "%s. %s." % (status, response.text)
            if failure is not None:
                raise EngagementException(failure, fatal=True)

            # The attribute is only present on some responses; record it when it is.
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
        except EngagementException:
            raise
        except Exception as err:
            raise EngagementException(err, True)

        return response.json()
コード例 #12
0
    def _get_company_info(self):
        """Return the result of ``clearbit.Company.find`` for the enrich key.

        The enrich key is passed as the ``domain`` argument, together with
        ``stream=True``.

        Raises:
            EngagementException: an EngagementException from the lookup is
                re-raised as is; any other exception is wrapped with
                ``True`` as the second argument.
        """
        try:
            return clearbit.Company.find(domain=self.enrich_key, stream=True)
        except EngagementException:
            raise
        except Exception as err:
            raise EngagementException(err, True)
コード例 #13
0
 def set_enrich_key(self):
     """Use the entity's deduced name as ``self.enrich_key``.

     A person is keyed by ``P.FULL_NAME``, a company by ``C.NAME``.

     Raises:
         EngagementException: when the entity type is not recognised or the
             relevant name property is missing.
     """
     t = self.enriched_entity.__class__.__name__
     if t == 'AcureRatePerson' and P.FULL_NAME in self.enriched_entity.deduced:
         name = self.enriched_entity.deduced[P.FULL_NAME]
     elif t == 'AcureRateCompany' and C.NAME in self.enriched_entity.deduced:
         name = self.enriched_entity.deduced[C.NAME]
     else:
         # Format the message here: a second positional argument to
         # EngagementException is its fatal flag, not a format argument.
         raise EngagementException(
             "BloombergScraper - cannot engage - cannot generate enrich key. Entity type: %s"
             % (t,))
     self.enrich_key = name
コード例 #14
0
 def set_enrich_key(self):
     """Derive ``self.enrich_key`` (a CrunchBase permalink) from the entity.

     A stored permalink is used when the entity has one; otherwise the
     permalink is built from the entity's name with
     ``CrunchBaseEngager.formalize_permalink``.

     Raises:
         EngagementException: when neither a permalink nor a name is
             available, or the entity type is not recognised.
     """
     t = self.enriched_entity.__class__.__name__
     # The messages below are fully formatted before raising: a second
     # positional argument to EngagementException is its fatal flag, so the
     # entity type must not be passed there.
     if t == 'AcureRatePerson':
         if P.CB_PERMALINK in self.enriched_entity.deduced:
             self.enrich_key = self.enriched_entity.deduced[P.CB_PERMALINK]
         elif P.FULL_NAME in self.enriched_entity.deduced:
             name = self.enriched_entity.deduced[P.FULL_NAME]
             self.enrich_key = CrunchBaseEngager.formalize_permalink(name)
         else:
             raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key for person. No permalink or name")
     elif t == 'AcureRateCompany':
         if C.CRUNCHBASE_PERMALINK in self.enriched_entity.deduced:
             self.enrich_key = self.enriched_entity.deduced[C.CRUNCHBASE_PERMALINK]
         elif C.NAME in self.enriched_entity.deduced:
             name = self.enriched_entity.deduced[C.NAME]
             self.enrich_key = CrunchBaseEngager.formalize_permalink(name)
         else:
             raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key for company. No permalink or name")
     else:
         raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key. Entity type: %s" % (t,))
コード例 #15
0
 def _deserialize(self, entity_type, _entity):
     if type(_entity) is str:
         entity_json = json.loads(_entity, object_hook=json_util.object_hook)
         if entity_type == 'people':
             entity = AcureRatePerson.reconstruct(entity_json)
         elif entity_type == 'company':
             entity = AcureRateCompany.reconstruct(entity_json)
         else:
             raise EngagementException('Unknown entity type - %s', entity_type)
     else:
         entity = _entity
     return entity
コード例 #16
0
 def _handle_fc_api_errors(self, response):
     if response.status_code == 200:  # All is ok.
         return
     # Handle different errors. Documentation - https://www.fullcontact.com/developer/docs/
     if response.status_code == 403:  # Quota exceeded - need special treatment
         raise EngagementException("403. Quota Exceeded.", True)
     elif response.status_code == 405 or response.status_code == 410 or response.status_code == 422:
         raise EngagementException(
             "%s. Invalid request sent to FC %s" %
             (response.status_code, response.text), True)
     elif response.status_code == 404:
         raise EngagementException(
             "404. Searched in the past 24 hours and nothing was found: %s"
             % response.text)
     elif response.status_code == 500 or response.status_code == 503:
         raise EngagementException(
             "%s. Transient errors in FC server. Possible maintenance/downtime. %s"
             % (response.status_code, response.text), True)
     elif response.status_code == 202:  # being processed...
         raise EngagementException(
             "202. Did not get info. Request is being processed. Return later."
         )
     else:
         raise EngagementException(
             "%s. Unknown error: %s" %
             (response.status_code, response.text), True)
コード例 #17
0
    def _get_company_info(self, domain):
        try:
            response = self.fc.api_get('company', **{'domain': domain})
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
            self._handle_fc_api_errors(response)
        except EngagementException as e:
            raise e
        except Exception as e:
            raise EngagementException(e, True)

        json = response.json()
        return json
コード例 #18
0
    def enrich_person(self):
        """Test hook: raise, change data, or do nothing, per ``self.what_to_do``.

        ``'error'`` raises a test exception. ``'change'`` sets ``shoe_size``
        and ``eyes_color`` and adds one address to ``emails``. Any other
        value changes nothing.

        Returns:
            ``['shoe_size', 'eyes_color']`` - the keys written by the
            ``'change'`` path.

        Raises:
            EngagementException: when ``self.what_to_do`` is ``'error'``.
        """
        if self.what_to_do == 'error':
            raise EngagementException('Test Engager throwing test exception.')
        elif self.what_to_do == 'change':
            self.set_data('shoe_size', 44)
            self.set_data('eyes_color', 'green')

            # NOTE(review): name_dbl is computed but the address below uses the
            # literal text 'name_dbl' - confirm which one is intended.
            name_dbl = self.enriched_entity.deduced.get(
                P.FULL_NAME, '<Noname>') * 2
            self.add_data('emails', 'name_dbl%s' % '@nowhere.com')

        # Report the key that is actually written above ('shoe_size').
        return ['shoe_size', 'eyes_color']
コード例 #19
0
ファイル: pipl_engager.py プロジェクト: drorgarti/SatoriLab
 def set_enrich_key(self):
     """Derive ``self.enrich_key`` from the pivot email and/or the name.

     The key is "<email> <first> <last>" when all three are known, the
     email alone when only it is known, or "<first> <last>" when there is
     no email.

     Raises:
         EngagementException: when neither an email nor a full name is
             available.
     """
     email = self.get_pivot_email()
     fname = self.enriched_entity.deduced.get(P.FIRST_NAME, None)
     lname = self.enriched_entity.deduced.get(P.LAST_NAME, None)
     if email and fname and lname:
         self.enrich_key = "%s %s %s" % (email, fname, lname)
     elif email:
         self.enrich_key = email
     elif fname and lname:
         self.enrich_key = "%s %s" % (fname, lname)
     else:
         # Format the message here: a second positional argument to
         # EngagementException is its fatal flag, not a format argument.
         raise EngagementException(
             "Pipl - cannot engage. Cannot create enrich key for %s"
             % (self.enriched_entity,))
コード例 #20
0
    def _get_person_info(self):

        try:
            response = self.fc.api_get('person', **{'email': self.enrich_key})
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
            self._handle_fc_api_errors(response)
            # TODO: check if we can inspect the header and see our limit remaining...
            #r.headers['x-rate-limit-remaining']
        except EngagementException as e:
            raise e
        except Exception as e:
            raise EngagementException(e, True)

        json = response.json()
        return json
コード例 #21
0
ファイル: pipl_engager.py プロジェクト: drorgarti/SatoriLab
    def get_info(self):
        """Query the Pipl search API by email and/or first and last name.

        Returns:
            The parsed JSON response, or ``None`` when the entity has neither
            an email nor a full name to search with.

        Raises:
            EngagementException: whatever ``_handle_pipl_api_errors`` raises
                is re-raised as is; any other exception from the request or
                from parsing is wrapped with ``True`` as the second argument.
        """
        email = self.get_pivot_email()
        fname = self.enriched_entity.deduced.get('first_name', None)
        lname = self.enriched_entity.deduced.get('last_name', None)
        named = fname and lname

        # Build the search request from the identifiers that are available.
        # TODO: pass the matching criteria in the request: "(email and name)" or "email", etc.
        if email:
            payload = {'key': PiplEngager.THE_KEY, 'email': email}
            if named:
                payload['first_name'] = fname
                payload['last_name'] = lname
        elif named:
            payload = {
                'key': PiplEngager.THE_KEY,
                'first_name': fname,
                'last_name': lname,
            }
        else:
            return None

        # Set the match requirements
        payload['minimum_probability'] = 0.7
        payload['minimum_match'] = 1

        try:
            # TODO: Look into the 'X-APIKey-Quota-Current' response header
            response = requests.get('https://api.pipl.com/search',
                                    params=payload)
            # Record the attribute before the status check, when it is present.
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
            self._handle_pipl_api_errors(response)
            parsed = json.loads(response.text)
        except EngagementException:
            raise
        except Exception as err:
            raise EngagementException(err, True)

        return parsed
コード例 #22
0
    def launch(self, provider, entity_type, entity_string, force):
        """Instantiate a provider and run one engagement through it.

        Args:
            provider: Identifier handed to ``self._instantiate_provider``.
            entity_type: Entity type name, forwarded to ``_deserialize`` and
                to the provider's ``engage``.
            entity_string: Serialized entity, forwarded to ``_deserialize``.
            force: Forwarded unchanged to the provider's ``engage``.

        Returns:
            The engagement result rendered with ``to_json_string()``, or
            ``None`` when the engagement raised an EngagementException
            (which is logged) or produced no result.

        Raises:
            EngagementException: when the provider cannot be instantiated.
        """
        # Instantiate the provider
        instance = self._instantiate_provider(provider)
        if instance is None:
            raise EngagementException('Aborting launch. Failed to instantiate provider %s' % provider)

        self.logger.info('Provider %s instantiated and ready.', provider)

        try:
            entity = self._deserialize(entity_type, entity_string)
            self.logger.info('About to launch an engagement via %s on %s', provider, entity)
            engagement_result = instance.engage(entity_type, entity, force)
        except EngagementException as e:
            self.logger.error('Exception raised: %s', e)
            # Without a result there is nothing to serialize; calling
            # to_json_string() on None would raise AttributeError.
            return None

        if engagement_result is None:
            return None
        return engagement_result.to_json_string()
コード例 #23
0
    def enrich_person(self):
        """Copy person properties from the lookup result onto the entity.

        Calls ``self._get_person_info()`` and stores name, email, gender,
        bio, location, social profiles and employment through ``set_data`` /
        ``add_data``. Only bio, location, social profiles and employment
        count as enrichment; name, email and gender do not.

        Returns:
            ``[P.JOBS]``.

        Raises:
            EngagementException: when the result is pending, has no
                ``'person'`` entry, or none of the enriching properties was
                found.
        """
        result_obj = self._get_person_info()

        if 'pending' in result_obj and result_obj['pending']:
            raise EngagementException(
                'Failed to get information on person %s. Pending (202)' % self.enrich_key)

        if 'person' not in result_obj or result_obj['person'] is None:
            raise EngagementException(
                'Failed to get information on person %s. Not Found (404)' % self.enrich_key)

        person_data = result_obj['person']
        enriched = False

        # Name properties
        if 'name' in person_data:
            name_parts = person_data['name']
            self.set_data(P.FIRST_NAME, name_parts['givenName'])
            self.set_data(P.LAST_NAME, name_parts['familyName'])
            self.set_data(P.FULL_NAME, name_parts['fullName'])

        # Only store an email that differs from the one we searched with
        if 'email' in person_data and person_data['email'] != self.enrich_key:
            found_email = person_data['email']
            self.set_data(P.EMAIL, found_email)
            self.add_data(P.EMAILS, found_email)

        # Gender is stored but does not count as enrichment
        if 'gender' in person_data and person_data['gender']:
            self.add_data(P.GENDER, person_data['gender'])

        # Plain values copied as they are
        for source_key, target in (('bio', P.SHORT_DESCRIPTION),
                                   ('location', P.LOCATIONS)):
            if source_key in person_data and person_data[source_key]:
                enriched = True
                self.add_data(target, person_data[source_key])

        # Facebook stores the handle itself...
        if 'facebook' in person_data and person_data['facebook']['handle']:
            enriched = True
            self.add_data(P.FACEBOOK_URL, person_data['facebook']['handle'])

        # ...while the other networks store their whole profile entry.
        # NOTE(review): confirm whether these should store the handle as well.
        for network, target in (('linkedin', P.LINKEDIN_URL),
                                ('twitter', P.TWITTER_URL),
                                ('googleplus', P.GOOGLEPLUS_URL)):
            if network in person_data and person_data[network]['handle']:
                enriched = True
                self.add_data(target, person_data[network])

        if 'employment' in person_data:
            employment = person_data['employment']
            job = {}
            for field, prop in (('name', P.JOB_NAME),
                                ('title', P.JOB_TITLE),
                                ('role', P.JOB_ROLE)):
                value = employment.get(field, None)
                if value is not None:
                    job[prop] = value
            if job:
                enriched = True
                self.add_data(P.JOBS, job)

        # TODO: gravatar, aboutme, github

        if not enriched:
            raise EngagementException(
                'Failed: no information added to person %s' % self.enrich_key)

        return [P.JOBS]
コード例 #24
0
    def enrich_company(self):
        """Enrich the company entity from its CrunchBase organization node.

        When ``self.enrich_key`` is empty the company is first searched by
        name and the search must return exactly one item, whose permalink is
        used; otherwise the enrich key is the permalink. The organization
        node is then fetched with ``self.get_node`` and its properties are
        stored through ``set_data`` / ``add_data``.

        Returns:
            ``[C.NAME]``.

        Raises:
            EngagementException: wraps every failure inside the method,
                including a failed, ambiguous or empty name search.
        """
        # Bound before the ``try`` so the error report in the handler below
        # can always format it, even when reading the name itself fails.
        name = None
        try:
            name = self.enriched_entity.deduced['name']
            if not self.enrich_key:
                # Search for all the companies with this name
                # NOTE(review): the search goes through PEOPLE_URL although it
                # looks for a company - confirm this is the intended endpoint.
                response = self._make_request(CrunchBaseEngager.PEOPLE_URL,
                                              {'name': name})
                data = response.json().get('data')
                if not data or data.get('error'):
                    raise EngagementException(
                        "CrunchBaseEngager: error in retrieving company %s." %
                        name)
                if len(data["items"]) > 1:
                    raise EngagementException(
                        "CrunchBaseEngager: company %s is ambiguous. Found %d companies with this name."
                        % (name, len(data["items"])))
                if len(data["items"]) == 0:
                    raise EngagementException(
                        "CrunchBaseEngager: company %s not found." % name)

                # TODO: Future: iterate over the returned items and check for
                # another matching attribute (like a social url) to pick the
                # right one
                permalink = data['items'][0]['properties']['permalink']
            else:
                permalink = self.enrich_key

            response = self.get_node('organizations', permalink)
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)

            data = response.json().get('data')
            org = Organization(data)

            # Name
            if org.name:
                self.set_data(C.NAME, org.name)

            # Get company logo
            if org.primary_image and len(org.primary_image) > 0:
                logo_url = org.primary_image[0].asset_path
                self.add_data(C.LOGOS, {
                    C.LOGO_URL: logo_url,
                    C.LOGO_SOURCE: 'crunchbase'
                })

            # Get overview stats (acquisitions, total funds, etc.)
            if org.acquired_by and hasattr(org.acquired_by, 'acquirer'):
                acquiring_company = org.acquired_by.acquirer.name
                self.set_data(C.ACQUIRED_BY, acquiring_company)

            # Get headquarters
            if org.headquarters and len(org.headquarters) > 0:
                headquarters = '%s, %s' % (org.headquarters[0].city,
                                           org.headquarters[0].country)
                self.set_data(C.HEADQUARTERS, headquarters)

            # Get description
            if org.short_description:
                description = org.short_description
                self.set_data(C.DESCRIPTION, description)

            # Get founders (the original set this twice; once is enough)
            if org.founders and len(org.founders) > 0:
                founders = []
                for founder in org.founders:
                    full_name = '%s %s' % (founder.first_name,
                                           founder.last_name)
                    founders.append(full_name)
                self.set_data(C.FOUNDERS, founders)

            # Get categories
            if org.categories and len(org.categories) > 0:
                for category in org.categories:
                    self.add_data(C.CATEGORIES, category.name)

            # Grab aliases
            if org.also_known_as:
                self.set_data(C.ALIASES, org.also_known_as)

            # Grab websites --> homepage_url ?
            if org.homepage_url and len(org.homepage_url) > 0:
                self.set_data(C.WEBSITE, org.homepage_url)

            # Is it a VC company
            if org.role_investor:
                self.set_data(C.INVESTMENT_COMPANY_TYPE,
                              C.ORGANIZATION_TYPE_VENTURE_CAPITAL)

            # Is it an educational organization
            if org.role_school:
                self.set_data(C.ORGANIZATION_TYPE, C.ORGANIZATION_TYPE_SCHOOL)

            # Get socials
            if org.websites and len(org.websites) > 0:
                for url in org.websites:
                    url_type = url.website_type.lower()
                    if url_type == 'twitter':
                        self.set_data(C.TWITTER_URL, url.url)
                    elif url_type == 'facebook':
                        self.set_data(C.FACEBOOK_URL, url.url)
                    elif url_type == 'linkedin':
                        self.set_data(C.LINKEDIN_URL, url.url)
                    elif url_type == 'angellist':
                        self.set_data(C.ANGELLIST_URL, url.url)

            # Get investments
            if org.investments and len(org.investments) > 0:
                all_investments = set()
                for investment in org.investments:
                    investment_name = investment.invested_in.name
                    all_investments.add(investment_name)
                self.set_data(C.PORTFOLIO_COMPANIES, list(all_investments))

            # Get founding year
            if org.founded_on:
                founding_year = org.founded_on.year
                self.set_data(C.FOUNDING_YEAR, founding_year)

            # Get number of employees
            if org.num_employees_min and org.num_employees_max:
                employees_range_str = '%s|%s' % (org.num_employees_min,
                                                 org.num_employees_max)
                self.set_data(C.EMPLOYEES_RANGE, employees_range_str)

            # Go over all investors
            if org.investors and len(org.investors) > 0:
                investors = []
                for investor in org.investors:
                    investor_dict = investor.data
                    investor_type = investor_dict['type'].lower()
                    if investor_type == 'person':
                        investor_name = '%s %s' % (
                            investor_dict['properties']['first_name'],
                            investor_dict['properties']['last_name'])
                    elif investor_type == 'organization':
                        investor_name = investor_dict['properties']['name']
                    else:
                        # Unknown investor kind: there is no name to report.
                        # Skipping avoids reusing the previous investor's
                        # name (or an unbound variable on the first pass).
                        continue
                    investors.append(
                        (investor_name, investor_type, 'partner/round'))
                self.set_data(C.INVESTORS, investors)

            # Go over all board members
            if org.board_members_and_advisors and len(
                    org.board_members_and_advisors) > 0:
                board_members = []
                for board_member in org.board_members_and_advisors:
                    full_name = '%s %s' % (board_member.person.first_name,
                                           board_member.person.last_name)
                    board_members.append(full_name)
                self.set_data(C.ADVISORS, board_members)

            # Past members first, then current ones, in one combined list
            team_members = []
            if org.past_team and len(org.past_team) > 0:
                for team_member in org.past_team:
                    full_name = team_member.person.first_name + " " + team_member.person.last_name
                    team_members.append(full_name)

            if org.current_team and len(org.current_team) > 0:
                for team_member in org.current_team:
                    full_name = team_member.person.first_name + " " + team_member.person.last_name
                    team_members.append(full_name)

            if len(team_members) > 0:
                self.set_data(C.TEAM, team_members)

        except Exception as e:
            print(
                "CrunchBasengager::enrich_company - failed to enrich company %s (%s)"
                % (name, e))
            raise EngagementException(e)

        return [C.NAME]
コード例 #25
0
    def enrich_person(self):
        """Enrich the current person entity with data from the CrunchBase API.

        When ``self.enrich_key`` is empty the person is searched by full name
        (deduced first name + last name) and the search must yield exactly
        one result; otherwise ``self.enrich_key`` is used as the permalink.
        The person's node is then fetched and advisory roles, gender, bio,
        date of birth, education, jobs and founded companies are stored via
        ``set_data`` / ``add_data``.

        Returns:
            list: ``[P.FULL_NAME]``.

        Raises:
            EngagementException: when the person cannot be resolved to a
                single record, or wrapping any other error raised while
                fetching or parsing the data.
        """
        # Bound before the ``try`` so the error handler below can always
        # report it; it stays None when the permalink comes from enrich_key.
        name = None
        try:
            if not self.enrich_key:
                # Search for all the people with this name
                name = self.enriched_entity.deduced[
                    'first_name'] + " " + self.enriched_entity.deduced[
                        'last_name']
                response = self._make_request(CrunchBaseEngager.PEOPLE_URL,
                                              {'name': name})
                data = response.json().get('data')
                if not data or data.get('error'):
                    raise EngagementException(
                        "CrunchBaseEngager: error in retrieving person %s." %
                        name)
                items = data["items"]
                if len(items) > 1:
                    raise EngagementException(
                        "CrunchBaseEngager: person %s is ambiguous. Found %d people with this name."
                        % (name, len(items)))
                if len(items) == 0:
                    raise EngagementException(
                        "CrunchBaseEngager: person %s not found." % name)

                # TODO: Future: Iterate over the returned people and check if there's another matching attribute (like social url) we can use to choose the right person
                permalink = items[0]['properties']['permalink']
            else:
                permalink = self.enrich_key

            # Get information on person via permalink
            response = self._make_request(
                'https://api.crunchbase.com/v/3/people/' + permalink)
            if hasattr(response, 'from_cache'):
                self.set_data("from_cache", response.from_cache)
            data = response.json().get('data')
            people = Person(data)

            # TODO: deal with marking from cache... now I'm ignoring it

            person_data = people.data
            relationships = {}
            properties = {}
            if person_data and 'relationships' in person_data:
                relationships = person_data['relationships']
            if person_data and 'properties' in person_data:
                properties = person_data['properties']

            # TODO: the 'investments' relationship is currently not processed.

            if 'advisory_roles' in relationships:
                for role in relationships['advisory_roles']['items']:
                    # Best effort: a malformed role must not abort the rest.
                    try:
                        job_title = role["properties"]["title"]
                        company_name = role["relationships"]["organization"][
                            "properties"]["name"]
                        self.add_data(P.ADVISORY_JOBS, {
                            P.JOB_TITLE: job_title,
                            P.JOB_NAME: company_name
                        })
                    except Exception:
                        print('Unable to get advisory roles for %s' %
                              permalink)

            if 'gender' in properties:
                self.set_data(P.GENDER, properties['gender'])

            if 'bio' in properties:
                self.set_data(P.SHORT_DESCRIPTION, properties['bio'])

            if people.born_on:
                self.set_data(P.DOB, people.born_on)

            if people.degrees:
                for degree in people.degrees.items:
                    if not (degree.school and degree.school.name):
                        continue
                    education = {P.EDUCATION_INSTITUTE: degree.school.name}
                    # "start-end" when both dates are known, "start" when
                    # only the start date is known, nothing otherwise.
                    if degree.started_on and degree.completed_on:
                        education[P.EDUCATION_YEARS] = '%s-%s' % (
                            degree.started_on.year, degree.completed_on.year)
                    elif degree.started_on:
                        education[
                            P.EDUCATION_YEARS] = '%s' % degree.started_on.year
                    if degree.degree_type_name:
                        education[P.EDUCATION_DEGREE] = degree.degree_type_name
                    if degree.degree_subject:
                        education[P.EDUCATION_SUBJECT] = degree.degree_subject
                    self.add_data(P.EDUCATIONS, education)

            if people.jobs:
                for job in people.jobs:
                    if job.data['type'] != 'Job':
                        continue
                    job_info = {}
                    if job.title:
                        job_info["job_title"] = job.title
                    organization = job.data['relationships']['organization']
                    if organization['type'] == 'Organization':
                        if organization['properties']['name']:
                            job_info["job_name"] = organization['properties'][
                                'name']
                    job_properties = job.data['properties']
                    if job_properties['started_on']:
                        job_info["started_on"] = job_properties['started_on']
                    if job_properties['ended_on']:
                        job_info["ended_on"] = job_properties['ended_on']
                    if job_info:
                        self.add_data(P.JOBS, job_info)

            for company in people.founded_companies.items:
                self.add_data("founded_companies", company.name)

        except Exception as e:
            # ``name`` is None when the permalink was supplied directly, so
            # fall back to the enrich key to keep the message informative.
            print("CrunchBaseEngager failed to enrich person (name: %s)" %
                  (name or self.enrich_key, ))
            raise EngagementException(e)

        return [P.FULL_NAME]
コード例 #26
0
    def enrich_person(self):
        """Enrich the current person entity by scraping a Bloomberg profile.

        When the entity has no known Bloomberg URL, a web search (via
        ``search``) restricted to bloomberg.com is issued for
        ``self.enrich_key`` and exactly one result must point at an executive
        profile page; its person id and URL are stored.  The profile page is
        then downloaded and parsed for age, current job, description, board
        memberships, education and other affiliations.  Every parsing section
        is best effort: a failure is logged as a warning and the remaining
        sections still run.

        Returns:
            list: ``[P.FULL_NAME]``.

        Raises:
            EngagementException: when no single profile can be located, the
                person id cannot be extracted from the URL, or the page
                cannot be loaded.  Any other exception is logged and
                re-raised unchanged.
        """
        try:
            if P.BLOOMBERG_URL not in self.enriched_entity.deduced:
                # Search google for the person - the search string: 'site:bloomberg.com ploni almoni "executive profile"'
                url_prefixes = (
                    'http://www.bloomberg.com/research/stocks/private/person.asp?personId='
                    .lower(),
                    'http://www.bloomberg.com/research/stocks/people/person.asp?personId='
                    .lower(),
                )
                query = 'site:bloomberg.com "%s" "executive profile"' % self.enrich_key
                res = search(query,
                             tld='com',
                             lang='en',
                             num=3,
                             start=0,
                             stop=2,
                             pause=2.0)
                # Keep the matching URLs themselves: the loop variable of a
                # counting loop would end up holding the *last* search result,
                # which is not necessarily the one that matched.
                matching_urls = [
                    candidate for candidate in res
                    if candidate.lower().replace('https', 'http').startswith(
                        url_prefixes)
                ]
                if not matching_urls:
                    raise EngagementException(
                        'Unable to locate information in Bloomberg.com on %s' %
                        self.enrich_key)
                if len(matching_urls) > 1:
                    # TODO: we can improve search that will also consult working places and determine which person is the one we need... (try: Ariel Cohen)
                    raise EngagementException(
                        'Unable to locate information in Bloomberg.com - more than one match on %s'
                        % self.enrich_key)
                url = matching_urls[0]

                # Grab person id from url.  The prefix test above is
                # case-insensitive, so the id extraction is as well.
                id_match = re.search(r'asp\?personId=(\d+)', url,
                                     re.IGNORECASE)
                if id_match is None:
                    raise EngagementException(
                        'Unable to extract Bloomberg person id from url %s' %
                        url)
                self.set_data(P.BLOOMBERG_ID, id_match.group(1))
                # TODO: look into the full url google returns - what is capId?
                self.set_data(P.BLOOMBERG_URL, url)
            else:
                url = self.enriched_entity.deduced[P.BLOOMBERG_URL]

            # Get the person's page for parsing
            response = requests.get(url)
            if response.status_code != 200:
                s = 'Unable to load page in Bloomberg.com on %s. Error: %s. (url=%s)' % (
                    self.enrich_key, response.status_code, url)
                raise EngagementException(s)

            soup = BeautifulSoup(response.content, 'html.parser')

            # Get age
            try:
                td_elem = soup.find("td", string='Age')
                tr_elem2 = td_elem.parent.next_sibling
                age = tr_elem2.find("td").text
                if age != "--":
                    self.set_data(P.DOB, "%s years old" % age)
            except Exception:
                self.logger.warning('Unable to locate age attribute for %s',
                                    self.enrich_key)

            # Get current job
            try:
                job = {}
                elem = soup.find("span", {"itemprop": "jobTitle"})
                if elem:
                    job_title = elem.text
                    if job_title.strip():
                        job[P.JOB_TITLE] = job_title
                elem = soup.find("a", {"itemprop": "worksFor"})
                if elem:
                    job_name = elem.text
                    if job_name.strip():
                        job[P.JOB_NAME] = job_name
                if job:
                    self.add_data(P.JOBS, job)
            except Exception:
                self.logger.warning(
                    'Unable to locate job title/name attribute for %s',
                    self.enrich_key)

            # Get person's description
            try:
                elem = (soup.find("div", {"itemprop": "description"})
                        or soup.find("p", {"itemprop": "description"}))
                description = elem.text if elem else None
                if description:
                    description = description.replace('\n', '').replace(
                        'Read Full Background', '').strip()
                    self.set_data(P.DESCRIPTION, description)
            except Exception:
                self.logger.warning(
                    'Unable to locate description attribute for %s',
                    self.enrich_key)

            # Get the board positions
            try:
                # Scan every <h2>; only the heading that introduces the board
                # memberships is followed.  NOTE(review): the previous code
                # wrapped a ResultSet in a list (so ``.text`` always failed)
                # and filtered on 'Corporate Headquarters' - confirm against
                # a live page that scanning all headings is the intent.
                for heading in soup.find_all('h2'):
                    if not heading.text.startswith(
                            'Board Members Memberships'):
                        continue
                    for sibling in heading.next_siblings:
                        if 'no Board Members' in sibling:
                            break
                        # Text nodes are not guaranteed to expose ``name``.
                        sibling_name = getattr(sibling, 'name', None)
                        if sibling_name == "h2":
                            break
                        if sibling_name == "div" and sibling.find(
                                "a") is not None:
                            company_name = sibling.find("a").text
                            # TODO: pick up the word 'Director' - or? what else is there to be...?
                            self.add_data(
                                P.ADVISORY_JOBS, {
                                    P.JOB_NAME: company_name,
                                    P.JOB_TITLE: 'Director'
                                })
            except Exception:
                self.logger.warning(
                    'Unable to locate board positions information for %s',
                    self.enrich_key)

            # Bound outside the education ``try`` because the affiliations
            # section below needs it even when the education section fails.
            institutes_names = []
            try:
                # Get the education organizations
                education_elems = soup.find_all("div",
                                                {"itemprop": "alumniOf"})
                for e in education_elems:
                    # TODO: extract the Degree & Years, if available
                    institutes_names.append(e.text)
                    self.add_data(P.EDUCATIONS,
                                  {P.EDUCATION_INSTITUTE: e.text})
            except Exception:
                self.logger.warning(
                    'Unable to locate other affiliations information for %s',
                    self.enrich_key)

            try:
                # Get the other companies he worked for
                companies_elems = soup.find_all("a",
                                                {"itemprop": "affiliation"})
                for e in companies_elems:
                    # Skip affiliations already recorded as education.
                    if e.text not in institutes_names and e.text.strip():
                        self.add_data(P.JOBS, {P.JOB_NAME: e.text})
            except Exception:
                self.logger.warning(
                    'Unable to locate other affiliations information for %s',
                    self.enrich_key)

        except Exception as e:
            self.logger.error('Unable to enrich person %s. %s',
                              self.enriched_entity, e)
            raise
        return [P.FULL_NAME]
コード例 #27
0
    def _company_exists(company_name, cb_url=None, permalink=None):
        """Search the CrunchBase (Algolia) index for a single matching company.

        The function takes no ``self``; it is presumably exposed as a static
        method (the decorator is not visible here - confirm).

        Args:
            company_name: company name to search for; it is normalised with
                ``AcureRateUtils.clean_company_name`` before querying.
            cb_url: optional CrunchBase URL of the company.  Anything from
                the first ``?`` onwards is ignored.  A hit whose ``url`` is
                a suffix of this value is an exact match.
            permalink: optional CrunchBase permalink, compared
                case-insensitively.  A hit with this permalink is an exact
                match.

        Returns:
            list: a one-element list holding the matching hit (a dict).

        Raises:
            EngagementException: on a non-200 response (``fatal=True``), when
                the search returns no hits, when no hit matches, or when
                several hits match by name and none matches exactly.
        """
        # Issue a request to CB search server - if matches exist, compare using name or cb_url if provided.
        # Truncate possible parameters on url
        if cb_url and cb_url.find('?') > 0:
            cb_url = cb_url[:cb_url.index('?')]

        company_name_clean = AcureRateUtils.clean_company_name(company_name)

        url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
        query = 'query=%s&facetFilters=' % company_name_clean.replace(
            '&', '%26')
        payload = {
            "params": query,
            "apiKey": CrunchBaseScraperEngager.THE_KEY,
            "appID": CrunchBaseScraperEngager.APP_ID
        }
        headers = {
            'contentType': 'application/json; charset=utf-8',
            'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
            'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID
        }
        with requests_cache.disabled():
            response = requests.post(url, json=payload, headers=headers)
        # @@@ fatal
        if response.status_code == 429:
            raise EngagementException(
                "%s. Exceeded requests quota. Error: %s." %
                (response.status_code, response.text),
                fatal=True)
        if response.status_code != 200:
            raise EngagementException(
                "%s. %s." % (response.status_code, response.text), fatal=True)

        # Parse the body once instead of once per use.
        result = response.json()
        if result.get('nbHits', 0) == 0:
            raise EngagementException(
                "CrunchBaseScraper: No hits returned when searching for %s (%s)."
                % (company_name_clean, company_name))

        # Both sides of the permalink comparison are lowercased.
        permalink_lower = permalink.lower() if permalink else None
        wanted_name = company_name_clean.lower()

        # Check how many matches we have (if any)
        matches = []
        for company in result.get('hits', []):
            if not (company.get('type', '') == 'Organization'
                    and company.get('organization', False)
                    and 'name' in company):
                continue
            exact_permalink = (permalink_lower and 'permalink' in company
                               and company['permalink'].lower() ==
                               permalink_lower)
            # Compare URLs
            exact_url = (cb_url and 'url' in company
                         and cb_url.endswith(company['url']))
            if exact_permalink or exact_url:
                # An exact identifier match is definitive: discard any
                # earlier name-only matches so it is not reported as
                # ambiguous.
                matches = [company]
                break
            # Check by name
            result_company_name_clean = AcureRateUtils.clean_company_name(
                company.get('name'))
            if result_company_name_clean.lower() == wanted_name:
                matches.append(company)

        if not matches:
            raise EngagementException(
                "CrunchBaseScraper: No match for %s (%s)" %
                (company_name_clean, company_name))
        if len(matches) > 1:
            raise EngagementException(
                "CrunchBaseScraper: Ambiguous results - got %d hits for %s (%s)"
                % (len(matches), company_name_clean, company_name))
        return matches
コード例 #28
0
    def enrich_company(self):
        """Enrich the current company entity by scraping a Bloomberg snapshot.

        When the entity has no known Bloomberg URL, a web search (via
        ``search``) restricted to bloomberg.com is issued for
        ``self.enrich_key`` and exactly one result must point at a private
        company snapshot page; its id and URL are stored.  The snapshot page
        is then downloaded and parsed for the company overview, key
        executives and domain.  Every parsing section is best effort: a
        failure is logged as a warning and the remaining sections still run.

        Returns:
            list: ``[C.NAME]``.

        Raises:
            EngagementException: when no single snapshot page can be located,
                the company id cannot be extracted from the URL, or the page
                cannot be loaded.  Any other exception is logged and
                re-raised unchanged.
        """
        try:
            if C.BLOOMBERG_URL not in self.enriched_entity.deduced:
                # Search google for the company snapshot page - the search string: 'site:bloomberg.com snapshot "<company name>"'
                # The former second prefix was a placeholder
                # ('http://something something') that no URL could match.
                url_prefix = 'http://www.bloomberg.com/research/stocks/private/snapshot.asp?privcapId='.lower(
                )
                query = 'site:bloomberg.com snapshot "%s"' % self.enrich_key
                res = search(query,
                             tld='com',
                             lang='en',
                             num=3,
                             start=0,
                             stop=2,
                             pause=2.0)
                # Keep the matching URLs themselves: the loop variable of a
                # counting loop would end up holding the *last* search result,
                # which is not necessarily the one that matched.
                matching_urls = [
                    candidate for candidate in res
                    if candidate.lower().replace('https', 'http').startswith(
                        url_prefix)
                ]
                if not matching_urls:
                    raise EngagementException(
                        'Unable to locate information in Bloomberg.com on %s' %
                        self.enrich_key)
                if len(matching_urls) > 1:
                    # TODO: we can improve search that will also consult working places and determine which company is the one we need...
                    raise EngagementException(
                        'Unable to locate information in Bloomberg.com - more than one match on %s'
                        % self.enrich_key)
                url = matching_urls[0]

                # Grab the company id from the url.  Snapshot URLs carry
                # ``privcapId``; the person pattern (``personId``) never
                # matches them.
                id_match = re.search(r'asp\?privcapId=(\d+)', url,
                                     re.IGNORECASE)
                if id_match is None:
                    raise EngagementException(
                        'Unable to extract Bloomberg company id from url %s' %
                        url)
                # NOTE(review): the id is stored under the person constant
                # because no company-level id constant is visible here -
                # confirm whether one exists on ``C``.
                self.set_data(P.BLOOMBERG_ID, id_match.group(1))
                # TODO: look into the full url google returns - what is capId?
                # Stored under the same key the guard above checks.
                self.set_data(C.BLOOMBERG_URL, url)
            else:
                url = self.enriched_entity.deduced[C.BLOOMBERG_URL]

            # Get the company's page for parsing
            response = requests.get(url)
            if response.status_code != 200:
                s = 'Unable to load page in Bloomberg.com on %s. Error: %s. (url=%s)' % (
                    self.enrich_key, response.status_code, url)
                raise EngagementException(s)

            soup = BeautifulSoup(response.content, 'html.parser')

            # Get company's overview
            try:
                elem = (soup.find("div", {"itemprop": "description"})
                        or soup.find("p", {"itemprop": "description"}))
                description = elem.text if elem else None
                if description:
                    description = description.replace('\n', '').replace(
                        'Read Full Background', '').strip()
                    self.set_data(C.DESCRIPTION, description)
            except Exception:
                self.logger.warning(
                    'Unable to locate company overview attribute for %s',
                    self.enrich_key)

            # Get key executives
            try:
                for elem in soup.find_all("a", {"itemprop": "member"}):
                    name = elem.text.replace('Mr.', '').strip()
                    name_tokens = name.split(' ')
                    the_name = name
                    if len(name_tokens) == 3:
                        # Three tokens: keep the first and last, drop the
                        # middle one.
                        the_name = name_tokens[0] + ' ' + name_tokens[2]
                    elif len(name_tokens) != 2:
                        self.logger.warning(
                            'Not sure how many tokens are in this name - %s',
                            name)
                    self.add_data(C.TEAM, the_name)
            except Exception as e:
                self.logger.warning(
                    'Unable to locate company executives for %s (%s)',
                    self.enrich_key, e)

            # Get phones
            # TODO...

            # Get domain
            try:
                elem = soup.find("a", {"itemprop": "url"})
                self.set_data(C.DOMAIN, elem.text)
            except Exception:
                self.logger.warning('Unable to locate domain attribute for %s',
                                    self.enrich_key)

            # Get address
            # TODO...

            # Get founding year
            # TODO: not implemented - the founding year needs to be located
            # relative to the element with itemprop "address".

        except Exception as e:
            self.logger.error('Unable to enrich company %s. %s',
                              self.enriched_entity, e)
            raise

        return [C.NAME]
コード例 #29
0
    def enrich_person(self):
        """Look the person up in the CrunchBase search index and record the hit.

        Queries the Algolia-backed index with ``self.enrich_key`` and requires
        exactly one ``Person`` hit whose name equals the key.  From that hit
        the name parts, photo, location, social URLs, current job, primary
        role, business flag and investor flag are stored via ``set_data`` /
        ``add_data``.

        Any exception raised along the way is logged and swallowed, so the
        method always returns.

        Returns:
            list: ``[P.FULL_NAME]``.
        """
        try:
            # TODO: improve - run 3 searches - by full name, first name and last name. Check all results agains P.possible_names...
            search_url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
            response = requests.post(
                search_url,
                json={
                    "params": 'query=%s&facetFilters=' % self.enrich_key,
                    "apiKey": CrunchBaseScraperEngager.THE_KEY,
                    "appID": CrunchBaseScraperEngager.APP_ID
                },
                headers={
                    'contentType': 'application/json; charset=utf-8',
                    'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                    'X-Algolia-Application-Id':
                    CrunchBaseScraperEngager.APP_ID
                })

            status = response.status_code
            if status == 429:
                raise EngagementException(
                    "%s. Exceeded requests quota. Error: %s." %
                    (status, response.text),
                    fatal=True)
            if status != 200:
                raise EngagementException("%s. %s." % (status, response.text),
                                          fatal=True)

            result = response.json()
            if result['nbHits'] == 0:
                raise EngagementException(
                    "No hits returned when searching for %s." %
                    self.enrich_key)

            # Keep only person hits whose name is exactly the search key
            matches = [
                hit for hit in result.get('hits', [])
                if hit.get('type', '') == 'Person' and hit.get(
                    'person', False) and hit.get('name', '') == self.enrich_key
            ]
            if not matches:
                raise EngagementException(
                    "None of the hits match the person name we're searching for (%s)."
                    % self.enrich_key)
            if len(matches) > 1:
                raise EngagementException(
                    "Person name is ambiguous - got %d hits for %s. Not enriching."
                    % (len(matches), self.enrich_key))

            # Exactly one match is left at this point.
            # TODO: in the future, add the other persons we found to Queue for further enrichment
            person = matches[0]

            # Name parts
            first, middle, last = AcureRateUtils.tokenize_full_name(
                person['name'])
            self.set_data(P.FIRST_NAME, first)
            self.set_data(P.LAST_NAME, last)
            if middle:
                self.set_data(P.MIDDLE_NAME, middle)

            # Photo
            if 'logo_url' in person:
                self.add_data(P.PHOTOS, {
                    P.PHOTO_URL: person['logo_url'],
                    P.PHOTO_SOURCE: 'crunchbase'
                })

            # Location
            if 'location_name' in person:
                self.add_data(P.LOCATIONS, person['location_name'])

            # Social / profile links, copied over when present in the hit
            link_fields = (
                ('permalink', P.CB_PERMALINK),
                ('url', P.CRUNCHBASE_URL),
                ('linkedin_url', P.LINKEDIN_URL),
                ('twitter_url', P.TWITTER_URL),
            )
            for hit_key, target_field in link_fields:
                if hit_key in person:
                    self.set_data(target_field, person[hit_key])

            # Current position
            title = person.get('title')
            company = person.get('organization_name')
            if title and company:
                self.add_data(P.JOBS, {
                    P.JOB_CURRENT: True,
                    P.JOB_TITLE: title,
                    P.JOB_NAME: company
                })
                if AcureRateUtils.is_business(title):
                    self.logger.info('---->> %s - %s @ %s', person['name'],
                                     title, company)

            # Primary role (set whenever both values exist, even if empty)
            if title is not None and company is not None:
                self.set_data(P.PRIMARY_ROLE, '%s @ %s' % (title, company))

            # Set as business as person was found in CB...
            self.set_data(P.BUSINESS, True)
            self.set_data(P.BUSINESS_REASON, 'appears in CB')

            # Investor?
            if 'n_investments' in person and person['n_investments'] > 0:
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON,
                              '%s investments' % person['n_investments'])
                self.logger.info('--==--==-->> Worth looking into %s',
                                 person['name'])
        except Exception as e:
            # NOTE(review): this handler also catches the EngagementExceptions
            # raised above (presumably Exception subclasses), including the
            # ones created with fatal=True - confirm that swallowing them is
            # intended.
            self.logger.error(
                'Failed to set some properties on person %s. Returning partial. (exception: %s)',
                self.enriched_entity, e)
        return [P.FULL_NAME]
コード例 #30
0
    def enrich_person(self):
        """Enrich the current person entity from their Twitter profile.

        The screen name is resolved in this order:

        1. ``P.TWITTER_SCREEN_NAME`` from the entity's deduced properties;
        2. the result of ``_extract_screenname_from_url(self.enrich_key)``;
        3. a Twitter user search on the deduced full name, taking the first
           candidate whose parsed description is accepted by
           ``enriched_entity.fuzzy_match_on_jobs``.

        Fetching the profile and copying its properties is best-effort: any
        exception raised in that phase is logged and whatever was set before
        the failure is kept.

        Returns:
            list: ``[P.TWITTER_SCREEN_NAME, P.DESCRIPTION]``.

        Raises:
            EngagementException: if no screen name could be determined.
        """
        deduced = self.enriched_entity.deduced

        # Extract Twitter screen name
        screen_name = deduced.get(P.TWITTER_SCREEN_NAME, None)
        if not screen_name:
            screen_name = self._extract_screenname_from_url(self.enrich_key)
        if not screen_name:
            # No screen name yet: search Twitter by full name and cross the
            # candidates against the person's known jobs. A missing full name
            # simply skips the search, so the failure surfaces below as an
            # EngagementException rather than a bare KeyError.
            query = deduced.get(P.FULL_NAME)
            if query:
                possible_users = list(
                    tweepy.Cursor(self._api.search_users,
                                  q=query).items(10))  # TODO: why 10?
                for candidate in possible_users:
                    job = AcureRateJob.attempt_parse(candidate.description)
                    if self.enriched_entity.fuzzy_match_on_jobs(job):
                        screen_name = candidate.screen_name
                        self.logger.info(
                            'Located Twitter screen_name from %s matches. User = %s',
                            len(possible_users), str(candidate))
                        break

        if not screen_name:
            raise EngagementException(
                'Unable to enrich via Twitter. No twitter url/screenname.')

        try:
            # Keep Screen Name
            self.set_data(P.TWITTER_SCREEN_NAME, screen_name)

            # Pull info from Twitter
            user = self._api.get_user(screen_name)

            # Get user information
            self.set_data(P.FULL_NAME, user.name)
            self.set_data(P.TWITTER_FOLLOWERS_COUNT, user.followers_count)
            self.set_data(P.TWITTER_FRIENDS_COUNT, user.friends_count)
            self.set_data(P.TWITTER_LISTED_COUNT, user.listed_count)
            self.set_data(P.TWITTER_FAVOURITES_COUNT, user.favourites_count)
            self.set_data(P.TWITTER_STATUSES_COUNT, user.statuses_count)
            self.set_data(P.TWITTER_ACCOUNT_CREATION_DATE,
                          str(user.created_at))

            # Raw description first; it is replaced further down if the
            # assembled variant (with display urls) turns out non-empty.
            self.set_data(P.SHORT_DESCRIPTION, user.description)

            # Is Investor
            # TODO: refine this. We cannot rely only on the word 'investment'. Use NLTK.
            # Guard against a missing description so that one empty field
            # does not abort the remainder of the enrichment.
            lowered = (user.description or '').lower()
            if any(keyword in lowered
                   for keyword in ('investment', 'investor', 'investing')):
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON,
                              'Twitter: %s:' % user.description)

            # Get location
            self.add_data(P.LOCATIONS, user.location)

            # Get photo
            self.add_data(P.PHOTOS, {
                P.PHOTO_URL: user.profile_image_url,
                P.PHOTO_SOURCE: 'twitter'
            })

            # The 'description' and 'url' sections are looked up defensively:
            # indexing them directly raised KeyError for profiles lacking a
            # section, which silently skipped everything below.
            entities = user.entities or {}
            description_urls = entities.get('description', {}).get('urls', [])

            # Assimilate the display urls into the description
            assembled = self._assemble_description(user.description,
                                                   description_urls)
            if assembled:
                self.set_data(
                    P.SHORT_DESCRIPTION,
                    assembled)  # TODO: need to deal with URLs (grab them too)

            # Get all the urls a person may add to his twitter profile
            # (a set, so a url present in both sections is stored once).
            the_urls = set()
            for section in ('description', 'url'):
                for url_entity in entities.get(section, {}).get('urls', []):
                    expanded = url_entity.get('expanded_url')
                    if expanded:  # skip entries without an expanded url
                        the_urls.add(expanded)
            for related_url in the_urls:
                self.add_data(P.RELATED_URLS, {
                    P.RELATED_URL_SOURCE: 'Twitter',
                    P.RELATED_URL_VALUE: related_url
                })

            # Get "followers" (those who follow the person) and "friends"
            # (accounts the person chose to follow).
            if TwitterEngager.EXTRACT_FF:
                self.set_data(P.TWITTER_FOLLOWERS,
                              self._get_followers(screen_name))
                self.set_data(P.TWITTER_FRIENDS,
                              self._get_friends(screen_name))

        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            self.logger.error('Error raised during enrichment via twitter. %s',
                              e)

        # NOTE(review): the code above sets P.SHORT_DESCRIPTION, yet
        # P.DESCRIPTION is what gets reported here - confirm which is intended.
        return [P.TWITTER_SCREEN_NAME, P.DESCRIPTION]