Example #1
0
 def parse1(self, response):
     global companyCombo,urlCounter,urlCompany
     sel = Selector(response)
     db = DbOperations()
     for companyAttr in companyCombo:
         if(companyAttr['url'] in response.url):
             company = companyAttr
             break
     jobs = sel.xpath('//div[contains(@type,"tuple")]').extract()
     if len(jobs) > 0:
         for job in jobs:
             jobAttr = self.getJobAttributes(job)
             if(jobAttr):
                 db.insertJob(jobAttr,company['ro_id'])
             else:
                 print jobAttr
Example #2
0
 def parse2(self, response):
     """Parse a listing page, bulk-insert its jobs, and follow pagination.

     Finds the company owning ``response.url`` in the global ``companyCombo``,
     inserts the extracted "tuple" divs in one batch, then yields a request
     for the second pagination link (handled by ``parseNextUrl``).

     Fix: the original had ``else: return`` attached to the ``if`` INSIDE the
     loop, so only the first companyCombo entry was ever checked — any URL not
     matching that first entry aborted immediately. We now scan the whole list
     and return only when no entry matches (which also avoids the unbound
     ``company`` NameError).
     """
     global companyCombo,urlCounter,urlCompany
     sel = Selector(response)
     jobs = sel.xpath('//div[contains(@type,"tuple")]').extract()
     db = DbOperations()
     company = None
     for companyAttr in companyCombo:
         if companyAttr['url'] in response.url:
             company = companyAttr
             break
     if company is None:
         # URL belongs to no known company; skip this page entirely.
         return
     db.insertJobList(jobs, company['ro_id'])
     try:
         # Second href in the pagination bar is assumed to be "next page" —
         # TODO confirm; IndexError here (no such link) is reported below.
         jobNextUrl = sel.xpath('//div[contains(@class,"pagination")]/a/@href').extract()[1]
         request = scrapy.Request(jobNextUrl, callback=self.parseNextUrl)
         yield request
     except Exception:
         # Narrowed from a bare except so SystemExit/KeyboardInterrupt
         # still propagate; keep the original best-effort report.
         print "Unexpected error:", sys.exc_info()[0]
Example #3
0
    def parse(self,response):
        """Scrape up to 50 job rows from a Naukri search-result page.

        Derives the company slug from the response URL, maps it to a company
        id via the global ``companyCombo`` dict, parses each result row with
        BeautifulSoup, skips rows whose company name does not start with the
        slug, and inserts unseen jobs into ``naukri_jobs_4`` (duplicates are
        detected against ``naukri_jobs_3``).
        """
        db = DbOperations()
        url = response.url
        # Reduce the full URL to the bare search slug: drop site prefix and
        # the "-jobs" suffix...
        url = url.replace("http://jobsearch.naukri.com/","").replace("-jobs","")
        # ...then strip "-<digit>" fragments (presumably page numbers — TODO
        # confirm) to get the company name portion.
        urlCompany = re.sub("-[0-9]","",url)
        pp = pprint.PrettyPrinter(indent=4)
        global companyCombo
        # Map slug -> company id. NOTE(review): if no entry matches,
        # fk_comp_id is never bound and the SQL below raises NameError,
        # silently swallowed by the enclosing bare except.
        for comp_id in sorted(companyCombo.iterkeys()):
            if urlCompany in companyCombo[comp_id]:
                 fk_comp_id = comp_id
        sel = Selector(response)
        try:
            jobs = sel.xpath('//div[contains(@class,"row")]').extract()
            jobAttr = {}
            # Rows 1..50 — index 0 is presumably a header/filter row (TODO
            # confirm); an IndexError on pages with fewer rows is hidden by
            # the outer bare except.
            for i in range(1,51):
                elementParser = BeautifulSoup(jobs[i])
                try:
                    jobAttr['companyName'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="hiringOrganization").getText())
                except:
                    # No hiringOrganization span: not a job row, skip it.
                    continue
                # Keep only rows whose company name STARTS WITH the URL slug.
                if(jobAttr['companyName'].lower().find(urlCompany.lower()) == 0):
                    jobAttr['title'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="title").getText())
                    jobAttr['jobLocation'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="jobLocation").getText())
                    jobAttr['experience'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="experienceRequirements").getText())
                    jobAttr['salary'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="baseSalary").getText())
                    # Duplicate check. NOTE(review): SQL built by string
                    # concatenation is vulnerable to SQL injection — prefer a
                    # parameterized query if DbOperations supports one.
                    sql = """SELECT *
                            FROM naukri_jobs_3 WHERE
                            fk_company_id = '"""+str(fk_comp_id)+"""'
                            AND jobtitle = '"""+jobAttr['title']+"""'
                            AND location = '"""+jobAttr['jobLocation']+"""'"""
                    result = db.executeQuery(sql)
                    if result:
                        # NOTE(review): returns on the FIRST already-stored
                        # job, skipping all remaining rows on the page —
                        # confirm this early exit is intended.
                        return
                    # Short snippet: prefer the itemprop span, fall back to
                    # the "more" div, otherwise skip the row entirely.
                    try:
                        jobAttr['jobSnippet'] = db.cleanSpacesAndCharacters(elementParser.find("span",itemprop="description").getText())
                    except:
                        try:
                            jobAttr['jobSnippet'] = db.cleanSpacesAndCharacters(elementParser.find("div",class_="more").getText())
                        except:
                            continue
                    # Listing source; default to the company name if absent.
                    try:
                        jobAttr['source'] = db.cleanSpacesAndCharacters(elementParser.find("div",class_ = "rec_details").getText())
                    except:
                        jobAttr['source'] = jobAttr['companyName']
                    # First anchor in the row is assumed to be the job link.
                    jobUrl = elementParser.find("a").get("href")
                    jobAttr['jobUrl'] = jobUrl
                    # Fetch the detail page and try selectors most-specific
                    # first for the full description, normalizing whitespace
                    # and stripping single quotes (for the SQL below).
                    try:
                        jobPage = urllib2.urlopen(jobUrl).read()
                        jobDescriptionParser = BeautifulSoup(jobPage)
                        try:
                            jobAttr['jobDescription'] = ' '.join(jobDescriptionParser.find("ul",itemprop="description").getText().replace("\t","").replace("\n","").split()).replace("'","")
                        except:
                            try:
                                jobAttr['jobDescription'] = ' '.join(jobDescriptionParser.find("div",class_="f14 lh18 alignJ disc-li").getText().replace("\t","").replace("\n","").split()).replace("'","")
                            except:
                                try:
                                    jobAttr['jobDescription'] = jobDescriptionParser.find("meta",{"property":"og:description"})
                                    jobAttr['jobDescription'] = db.cleanSpacesAndCharacters(jobAttr['jobDescription']['content'])
                                except:
                                    try:
                                        jobAttr['jobDescription'] = ' '.join(jobDescriptionParser.findAll("td",{"class":"detailJob"})[2].getText().replace("\t","").replace("\n","").split()).replace("'","")
                                    except:
                                        # Last resort: reuse the snippet.
                                        jobAttr['jobDescription'] = jobAttr['jobSnippet']
                    except:
                        # Detail-page fetch failed. NOTE(review): debug print;
                        # 'jobDescription' may be unset here, so the INSERT
                        # below would raise KeyError (caught by outer except).
                        print "hello"

                    # NOTE(review): string-concatenated INSERT — injectable;
                    # only full_description is escaped (via MySQLdb), the
                    # other fields rely on cleanSpacesAndCharacters/smart_str.
                    sql = """INSERT INTO naukri_jobs_4 SET
                             jobtitle = '"""+jobAttr['title']+"""',
                             snippet = '"""+jobAttr['jobSnippet']+"""',
                             location = '"""+jobAttr['jobLocation']+"""',
                             naukri_company_name = '"""+jobAttr['companyName']+"""',
                             fk_company_id = '"""+str(fk_comp_id)+"""',
                             job_url = '"""+smart_str(jobAttr['jobUrl'])+"""',
                             experience = '"""+smart_str(jobAttr['experience'])+"""',
                             salary = '"""+smart_str(jobAttr['salary'])+"""',
                             full_description = '"""+smart_str(MySQLdb.escape_string(jobAttr['jobDescription']))+"""',
                             source = '"""+jobAttr['source']+"""'"""
                    db.executeQuery(sql)
                else:
                    # Company name does not match this page's slug; skip row.
                    # print "hello"
                    continue

        except:
            # NOTE(review): bare except — any parse/DB/index error silently
            # aborts processing of the whole page.
            return