Exemple #1
0
 def __init__(self, threadNum):
     """Initialise the crawler's request settings, work queue and DB handle.

     Args:
         threadNum: stored unchanged on ``self.threadNum``; presumably the
             number of worker threads to start (not used in this method).
     """
     # Browser-like User-Agent sent with every request made through
     # ``self.headers``.
     user_agent = (
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/45.0.2454.101 Safari/537.36'
     )
     self.headers = {'User-Agent': user_agent}
     self.URL = "http://www.lagou.com/"
     # Starts empty; per the original note, job names are kept here as tuples.
     self.position = []
     self.q_req = Queue()
     self.threadNum = threadNum
     self.lagou_db = LGDB()
Exemple #2
0
class LG:
    """Threaded crawler that collects job postings from lagou.com.

    ``run()`` reads the job-category names from the home page, puts them on
    a queue, and lets ``threadNum`` daemon worker threads fetch the postings
    for each category and hand them to ``self.lagou_db``.
    """

    # Seconds to wait for an HTTP response; without a timeout a stalled
    # connection would block a worker (and therefore ``run()``) forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, threadNum):
        """Set up request headers, the work queue and the database handle.

        Args:
            threadNum: number of worker threads ``run()`` will start.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
        self.URL = "http://www.lagou.com/"
        self.position = []  # list of (name, link) tuples, filled by getPosition()
        self.q_req = Queue()
        self.threadNum = threadNum
        # NOTE(review): LGDB is defined elsewhere; presumably a database wrapper.
        self.lagou_db = LGDB()

    def getPosition(self):
        """Collect (name, link) pairs for every job category on the home page.

        Returns:
            ``self.position`` with the pairs found appended. If the home page
            could not be downloaded, the list is returned unchanged.
        """
        pageCode = self.getPageCode(self.URL)
        if pageCode is None:
            # getPageCode() has already reported the failure; there is
            # nothing to parse, so do not hand None to PyQuery.
            return self.position

        query = PyQuery(pageCode)
        positionData = query(".menu_sub.dn .reset dd a")

        for i in range(positionData.length):
            data = positionData.eq(i)
            name = data.text()
            link = data.attr("href")
            self.position.append((name, link))

        print("获取职业列表成功!")
        return self.position

    def getPageCode(self, url):
        """Download ``url`` and return its body decoded as UTF-8.

        Sleeps a random 0-5 seconds before the request.

        Returns:
            The page text, or ``None`` if the request or the decoding failed.
        """
        time.sleep(random.randint(0, 5))
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            return response.content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError):
            print("*******连接有误********")
            return None

    def getJobList(self, kd):
        """Fetch up to 30 result pages for keyword ``kd`` and store every job.

        Does nothing if ``self.lagou_db.isRecordJobName(kd)`` is truthy.
        Otherwise each job gets ``_id`` set to its ``positionId`` and is
        passed to ``self.lagou_db.addJob``; once paging finishes, ``kd`` is
        passed to ``self.lagou_db.recordToSave`` so it is skipped next time.

        Network or payload errors propagate to the caller, so an incomplete
        keyword is never marked as recorded.

        Returns:
            Always ``None``.
        """
        # Skip keywords that were already recorded.
        if self.lagou_db.isRecordJobName(kd):
            print("数据库已记录过", kd)
            return None

        jobsId = []
        for index in range(1, 31):
            time.sleep(1)
            response = requests.post(
                "http://www.lagou.com/jobs/positionAjax.json?",
                data={'kd': kd, 'pn': index},
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            jobs = response.json()["content"]["result"]
            if not jobs:
                # An empty page means there are no further results.
                break
            print("开始获取%s的数据第%d页的%d条数据" % (kd, index, len(jobs)))
            for job in jobs:
                # Use the position id as the primary key of the stored record.
                jobId = job['positionId']
                job["_id"] = jobId
                # The logo path is relative; a job without a logo is left
                # as-is instead of aborting the whole keyword.
                logo = job.get("companyLogo")
                if logo:
                    job["companyLogo"] = "http://www.lagou.com/" + logo
                jobsId.append(jobId)
                self.lagou_db.addJob(job)

        print("%s的职位录入完毕!一共%d条数据" % (kd, len(jobsId)))
        # Mark the keyword as done so it is not fetched again next time.
        self.lagou_db.recordToSave(kd)
        return None

    def workingThread(self):
        """Worker loop: take keywords off the queue and crawl them forever.

        Intended to run in a daemon thread. Every queue item is acknowledged
        with ``task_done()`` even when crawling it fails, so that
        ``self.q_req.join()`` in ``run()`` cannot block indefinitely.
        """
        while True:
            kd = self.q_req.get()
            try:
                self.getJobList(kd)
                time.sleep(1)
            except Exception as err:
                # Top-level boundary of the thread: report the failure and
                # keep the worker alive for the remaining keywords.
                print("获取%s的职位数据失败: %r" % (kd, err))
            finally:
                self.q_req.task_done()

    def run(self):
        """Crawl every job category using ``self.threadNum`` worker threads.

        Blocks until every queued category has been processed.
        """
        # Fetch the category names first and queue them for the workers.
        for name, _ in self.getPosition():
            self.q_req.put(name)

        for _ in range(self.threadNum):
            # Daemon threads: the workers loop forever and must not keep the
            # process alive once the queue has drained.
            Thread(target=self.workingThread, daemon=True).start()

        self.q_req.join()