Example 1
    def _parse_response(self, html, company_name, keyword=None):
        ''' Parse Google results HTML into a scored dataframe of employees '''
        results = Google()._results_html_to_df(html)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        # '(?i)' is a regex inline flag, but fuzz compares plain strings,
        # so the prefix is matched literally and skews the scores slightly
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty:
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company_name
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company_name
            ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
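The branch on " " in company_name above matters because fuzz.ratio compares whole strings while fuzz.partial_ratio scores the best-matching substring. A minimal sketch of the difference, assuming the fuzzywuzzy package these snippets appear to use:

    from fuzzywuzzy import fuzz

    # whole-string similarity: extra words around the name drag the score down
    print(fuzz.ratio("acme", "acme corporation"))          # 40
    # best-substring similarity: "acme" occurs verbatim in the longer string
    print(fuzz.partial_ratio("acme", "acme corporation"))  # 100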
Example 2
    def _employees(self, company_name="", keyword=""):
        ''' LinkedIn scrape via a site-restricted Google query '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        results = Google().search(qry, 10)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company) 
                                        for company in results.company]
        else:
            results['company_score'] = [fuzz.ratio(_name, company) 
                                        for company in results.company]
        if keyword != "":
            results['score'] = [fuzz.ratio(keyword, title) 
                                for title in results.title]
            results = results[results.score > 75]

        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        data = {'data': results.to_dict('r'), 'company_name': company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        # once every job in the named queue has finished, hand the batch
        # off for CSV upload
        job = rq.get_current_job()
        print job.meta.keys()
        if "queue_name" in job.meta.keys():
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
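Example 2 leans on rq job metadata to decide when a whole crawl batch is finished; RQueue and Jigsaw are project-specific helpers. A minimal sketch of how metadata such as queue_name could be attached to a job with plain rq (the queue, worker function, and key names here are assumptions):

    from redis import Redis
    from rq import Queue

    q = Queue(connection=Redis())

    def crawl(company_name):
        pass  # stand-in for the actual crawl worker

    # meta travels with the job; _employees reads these keys back via
    # rq.get_current_job()
    job = q.enqueue(crawl, "Acme Inc")
    job.meta["queue_name"] = "acme-batch"
    job.meta["company_name"] = "Acme Inc"
    job.save()  # persist the updated meta (newer rq also has save_meta())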
Example 3
    def _employees(self, company_name="", keyword=None):
        ''' LinkedIn scrape via a site-restricted Google query '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        #results = Google().search(qry, 10)
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty:
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company_name
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company_name
            ]
        if keyword:
            results['score'] = [
                fuzz.partial_ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
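The -inurl: exclusions strip directory, job, and company pages from the results, leaving mostly public profile URLs. Printed for a hypothetical company, the composed query is just a phrase search plus those exclusions (names here are illustrative):

    args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
    args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
    qry = '"at {0}" {1} {2} site:linkedin.com'.format("Acme Inc", args, "engineer")
    print(qry)
    # "at Acme Inc" -inurl:"/dir/" ... -inurl:"company" engineer site:linkedin.com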
Example 4
 def _related(self, domain, api_key="", name=""):
     ''' Find similar companies via Google's related: search operator '''
     companies = Google().search("related:{0}".format(domain), 10)
     companies = companies.drop_duplicates()
     # rename the five raw result columns; the last two get throwaway names
     companies.columns = ['link','description','title','lol','lmao']
     data = {'data':companies.to_dict('r'),"domain":domain,"company_name":name}
     data["api_key"] = api_key
     CompanyExtraInfoCrawl()._persist(data, "similar", api_key)
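to_dict('r') is old-pandas shorthand for to_dict('records'), which turns each row into its own dict; that is the shape _persist receives throughout these examples. A quick illustration on a stand-in frame:

    import pandas as pd

    df = pd.DataFrame({'link': ['a.com', 'b.com'], 'title': ['A', 'B']})
    print(df.to_dict('records'))
    # [{'link': 'a.com', 'title': 'A'}, {'link': 'b.com', 'title': 'B'}]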
Example 5
    def _employees(self, company_name="", keyword=None):
        ''' LinkedIn scrape via a site-restricted Google query '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        #results = Google().search(qry, 10)
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty: 
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company) 
                                        for company in results.company_name]
        else:
            results['company_score'] = [fuzz.ratio(_name, company) 
                                        for company in results.company_name]
        if keyword:
            results['score'] = [fuzz.partial_ratio(keyword, title) 
                                for title in results.title]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
Example 6
 def _related(self, domain, api_key="", name=""):
     companies = Google().search("related:{0}".format(domain), 10)
     companies = companies.drop_duplicates()
     companies.columns = ['link', 'description', 'title', 'lol', 'lmao']
     data = {
         'data': companies.to_dict('r'),
         "domain": domain,
         "company_name": name
     }
     data["api_key"] = api_key
     CompanyExtraInfoCrawl()._persist(data, "similar", api_key)
Example 7
    def _employees(self, company_name="", keyword="", api_key=""):
        ''' LinkedIn scrape via a site-restricted Google query '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        #results = Google().search(qry, 10)
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty:
            print "No employees found for", company_name, keyword, api_key
            return results

        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company_name
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company_name
            ]
        if keyword != "":
            results['score'] = [
                fuzz.ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]

        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        results["company_id"] = api_key
        """"
        data = {'data': results.to_dict('r'), 'company_name':company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        job = rq.get_current_job()
        print job.meta.keys()
        if "queue_name" in job.meta.keys():
          if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        """
        return results
Example 8
    def _press_releases(self, qry, company_domain=None, period=None):
        queries = [
            '"{0}" site:prnewswire.com'.format(qry),
            '"{0}" site:businesswire.com'.format(qry),
            '"{0}" site:marketwired.com'.format(qry),
            '"{0}" site:newswire.ca'.format(qry),
            '"{0}" site:reuters.com'.format(qry)
        ]

        p = Google()._multi_get(queries)
        try:
            p = p.drop_duplicates()
        except:
            pass  # tolerate a missing or malformed results frame
        #p['date'] = [span.split('Business Wire')[-1].split('...')[0].strip() for span in p.link_span]
        # a news snippet looks like "<date> ... <body>"; split on the
        # ellipsis to recover each piece
        p['description'] = [
            "".join(span.split('...')[1:]).strip() for span in p.link_span
        ]
        p["domain"] = company_domain
        p['date'] = [span.split('...')[0].strip() for span in p.link_span]
        p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
        p['title'] = p['link_text']

        p = p.drop('link_text', 1)
        p = p.drop('url', 1)
        p = p.drop('link_span', 1)
        #for i in p.timestamp: print i
        data = p
        data["domain"] = company_domain
        data["event_type"] = "CompanyPressEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        data["event_key"] = [
            "".join(map(str, _data.values()))[:124]
            for _data in data.to_dict("r")
        ]
        _df = data.to_dict("r")
        for i in _df:
            for key in i.keys():
                if i[key] == None: del i[key]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
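Google result snippets for news links typically read "<date> ... <body>", which is why both the date and the description are recovered by splitting link_span on the ellipsis. A small illustration, with a plausible stand-in for the project's Helper()._str_to_timestamp (dateutil here is an assumption):

    import time
    from dateutil import parser

    span = "Mar 3, 2015 ... Acme Inc today announced a new widget line."
    date = span.split('...')[0].strip()                   # 'Mar 3, 2015'
    description = "".join(span.split('...')[1:]).strip()  # snippet body

    # one way the string-to-timestamp helper might be implemented
    timestamp = time.mktime(parser.parse(date).timetuple())
    print(timestamp)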
Example 9
    def _parse_response(self, html, company_name, keyword=None):
        results = Google()._results_html_to_df(html)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty: 
            print "No employees found for", company_name, keyword
            return results

        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company) 
                                        for company in results.company_name]
        else:
            results['company_score'] = [fuzz.ratio(_name, company) 
                                        for company in results.company_name]
        if keyword:
            results['score'] = [fuzz.partial_ratio(keyword, title) 
                                for title in results.title]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
Example 10
    def _press_releases(self, qry, company_domain=None, period=None):
        queries = ['"{0}" site:prnewswire.com'.format(qry),
                   '"{0}" site:businesswire.com'.format(qry),
                   '"{0}" site:marketwired.com'.format(qry),
                   '"{0}" site:newswire.ca'.format(qry),
                   '"{0}" site:reuters.com'.format(qry)]

        p = Google()._multi_get(queries)
        try:
            p = p.drop_duplicates()
        except:
            pass  # tolerate a missing or malformed results frame
        #p['date'] = [span.split('Business Wire')[-1].split('...')[0].strip() for span in p.link_span]
        p['description'] = ["".join(span.split('...')[1:]).strip() for span in p.link_span]
        p["domain"] = company_domain
        p['date'] = [span.split('...')[0].strip() for span in p.link_span]
        p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
        p['title'] = p['link_text']

        p = p.drop('link_text',1)
        p = p.drop('url',1)
        p = p.drop('link_span',1)
        #for i in p.timestamp: print i
        data = p
        data["domain"] = company_domain
        data["event_type"] = "CompanyPressEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        data["event_key"] = ["".join(map(str, _data.values()))[:124]
                             for _data in data.to_dict("r")]
        # drop None/NaN values row by row before inserting
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
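Examples 8 and 10 also assume module-level r and conn objects for the final insert; with the classic rethinkdb driver those would be set up along these lines (host, port, and database name are assumptions):

    import rethinkdb as r

    # connection object the snippets refer to as `conn`
    conn = r.connect(host="localhost", port=28015, db="crawler")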