Esempio n. 1
0
    def _parse_response(self, html, company_name, keyword=None):
        '''Parse a Google results page into a filtered LinkedIn employee frame.

        html         -- results markup passed to Google()._results_html_to_df
        company_name -- company being searched for; fuzzy-matched against the
                        ``company_name`` column of the parsed results
        keyword      -- optional title keyword; when truthy, only rows whose
                        ``title`` partial-ratio against it exceeds 75 are kept

        Returns the de-duplicated rows whose company score exceeds 49. When
        nothing was parsed the empty frame is returned unchanged.
        '''
        results = Google()._results_html_to_df(html)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        print(results.columns)
        if results.empty:
            print("No employees found for %s %s" % (company_name, keyword))
            return results

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text. Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower())
            for company in results.company_name
        ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        return results.drop_duplicates()
Esempio n. 2
0
    def _employees(self, company_name="", keyword=None):
        '''Search Google for LinkedIn profiles of people "at" a company.

        company_name -- company to look for; also fuzzy-matched against the
                        ``company_name`` column of the converted results
        keyword      -- optional title keyword; added to the query and, when
                        truthy, used to keep only rows whose ``title``
                        partial-ratio against it exceeds 75

        Returns the de-duplicated rows whose company score exceeds 49. When
        the search yields nothing the empty frame is returned unchanged.
        '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        # A missing keyword must not end up in the query as the text "None".
        qry = qry.format(company_name, args, keyword or "")
        # NOTE(review): an earlier revision passed 10 instead of 1 here;
        # presumably a page count -- confirm against Google.search.
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        print(results.columns)
        if results.empty:
            print("No employees found for %s %s" % (company_name, keyword))
            return results

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text. Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower())
            for company in results.company_name
        ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        return results.drop_duplicates()
Esempio n. 3
0
    def _employees(self, company_name="", keyword=""):
        '''Search Google for LinkedIn employees, persist them, and return them.

        company_name -- company to look for; also fuzzy-matched against the
                        ``company`` column of the converted results
        keyword      -- optional title keyword; added to the query and, when
                        truthy, used to keep only rows whose ``title`` ratio
                        against it exceeds 75

        The rows whose company score exceeds 64 are de-duplicated, handed to
        CompanyExtraInfoCrawl()._persist under "employees", and returned. When
        running inside an RQ job whose meta has a "queue_name" and that queue
        has completed, a Jigsaw CSV upload is enqueued.
        '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        # A missing keyword must not end up in the query as the text "None".
        qry = qry.format(company_name, args, keyword or "")
        results = Google().search(qry, 10)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text (and would push short exact
        # matches under the cut-off). Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower()) for company in results.company
        ]
        if keyword:
            results['score'] = [
                fuzz.ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]

        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        # Spell out the orient; the one-letter abbreviation is not accepted
        # by newer pandas releases.
        data = {
            'data': results.to_dict('records'),
            'company_name': company_name,
        }
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        # get_current_job() returns None when not executing inside a worker.
        job = rq.get_current_job()
        if job is not None:
            print(job.meta.keys())
            if "queue_name" in job.meta:
                if RQueue()._has_completed(job.meta["queue_name"]):
                    q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
Esempio n. 4
0
    def _employees(self, company_name="", keyword=None):
        '''Search Google for LinkedIn profiles of people "at" a company.

        company_name -- company to look for; also fuzzy-matched against the
                        ``company_name`` column of the converted results
        keyword      -- optional title keyword; added to the query and, when
                        truthy, used to keep only rows whose ``title``
                        partial-ratio against it exceeds 75

        Returns the de-duplicated rows whose company score exceeds 49. When
        the search yields nothing the empty frame is returned unchanged.
        '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        # A missing keyword must not end up in the query as the text "None".
        qry = qry.format(company_name, args, keyword or "")
        # NOTE(review): an earlier revision passed 10 instead of 1 here;
        # presumably a page count -- confirm against Google.search.
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        print(results.columns)
        if results.empty:
            print("No employees found for %s %s" % (company_name, keyword))
            return results

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text. Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower())
            for company in results.company_name
        ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        return results.drop_duplicates()
Esempio n. 5
0
    def _employees(self, company_name="", keyword="", api_key=""):
        '''Search Google for LinkedIn employees and tag them with ``api_key``.

        company_name -- company to look for; also fuzzy-matched against the
                        ``company_name`` column of the converted results
        keyword      -- optional title keyword; added to the query and, when
                        truthy, used to keep only rows whose ``title`` ratio
                        against it exceeds 75
        api_key      -- value stored in the ``company_id`` column of every
                        returned row

        Returns the de-duplicated rows whose company score exceeds 64. When
        the search yields nothing the empty frame is returned unchanged
        (without a ``company_id`` column).
        '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        # A missing keyword must not end up in the query as the text "None".
        qry = qry.format(company_name, args, keyword or "")
        # NOTE(review): an earlier revision passed 10 instead of 1 here;
        # presumably a page count -- confirm against Google.search.
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        print(results.columns)
        if results.empty:
            print("No employees found for %s %s %s"
                  % (company_name, keyword, api_key))
            return results

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text (and would push short exact
        # matches under the cut-off). Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower())
            for company in results.company_name
        ]
        if keyword:
            results['score'] = [
                fuzz.ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]

        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        results["company_id"] = api_key
        # NOTE: this variant neither persists the rows nor enqueues a
        # follow-up job; that code was disabled in the original.
        return results
Esempio n. 6
0
    def _parse_response(self, html, company_name, keyword=None):
        '''Parse a Google results page into a filtered LinkedIn employee frame.

        html         -- results markup passed to Google()._results_html_to_df
        company_name -- company being searched for; fuzzy-matched against the
                        ``company_name`` column of the parsed results
        keyword      -- optional title keyword; when truthy, only rows whose
                        ``title`` partial-ratio against it exceeds 75 are kept

        Returns the de-duplicated rows whose company score exceeds 49. When
        nothing was parsed the empty frame is returned unchanged.
        '''
        results = Google()._results_html_to_df(html)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        print(results.columns)
        if results.empty:
            print("No employees found for %s %s" % (company_name, keyword))
            return results

        # The fuzz scorers compare plain characters, so a regex flag such as
        # "(?i)" would be scored as literal text. Fold case explicitly instead.
        wanted = company_name.lower()
        # Multi-word names are matched as a substring, single words as a whole.
        scorer = fuzz.partial_ratio if " " in company_name else fuzz.ratio
        results['company_score'] = [
            scorer(wanted, company.lower())
            for company in results.company_name
        ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        return results.drop_duplicates()