Beispiel #1
0
 def __init__(self):
     """Prepare the HTTP opener, the paging counters and the database handle.

     The opener keeps cookies in an in-memory jar and sends a desktop Chrome
     User-agent header.  Database settings are loaded as JSON from
     ``config.txt`` (relative path); ``self._db`` is whatever
     ``MysqlDb(...).set_table(...)`` returns.
     """
     cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
     opener = urllib.request.build_opener(cookie_handler)
     opener.addheaders = [('User-agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')]
     self._opener = opener
     # Paging state; both counters start at zero.
     self._pageTotal = 0
     self._currentPage = 0
     with open('config.txt', 'r') as fd:
         conf = json.load(fd)
         port = int(conf['port'])
         database = MysqlDb(conf['user'], conf['password'], conf['db'], conf['host'], port)
         self._db = database.set_table(conf['table'])
Beispiel #2
0
class Statics:
    """Turn grouped counts from the configured table into charts via ``draw``."""

    def __init__(self):
        """Load connection settings from ``config.txt`` and bind ``self._db``."""
        with open('config.txt', 'r') as fd:
            conf = json.load(fd)
            port = int(conf['port'])
            database = MysqlDb(conf['user'], conf['password'], conf['db'], conf['host'], port)
            self._db = database.set_table(conf['table'])

    def _grouped_counts(self, field, condDict):
        """Run ``query_group_count`` and split its rows into two parallel lists.

        Element 0 of every row goes to the first list (labels), element 1 to
        the second (weights).
        """
        labels, weights = [], []
        for row in self._db.query_group_count(field, condDict):
            labels.append(row[0])
            weights.append(row[1])
        return labels, weights

    def generate_image_bar(self, field, xname, yname, title, condDict=None):
        """Pass the grouped counts for *field* to ``draw.draw_bar``.

        ``condDict`` is handed to ``query_group_count`` unchanged; ``xname``,
        ``yname`` and ``title`` are forwarded to the drawing call.
        """
        labels, weights = self._grouped_counts(field, condDict)
        draw.draw_bar(labels, weights, xname, yname, title)

    def generate_image_circle(self, field, title, condDict=None):
        """Pass the grouped counts for *field* to ``draw.draw_circle``.

        ``condDict`` is handed to ``query_group_count`` unchanged; ``title``
        is forwarded to the drawing call.
        """
        labels, weights = self._grouped_counts(field, condDict)
        draw.draw_circle(labels, weights, title)
Beispiel #3
0
 def __init__(self):
     """Load connection settings from ``config.txt`` and bind ``self._db``.

     The file is parsed as JSON and must provide the keys ``user``,
     ``password``, ``db``, ``host``, ``port`` and ``table``.
     """
     with open('config.txt', 'r') as fd:
         conf = json.load(fd)
         port = int(conf['port'])
         database = MysqlDb(conf['user'], conf['password'], conf['db'], conf['host'], port)
         self._db = database.set_table(conf['table'])
Beispiel #4
0
class LaGou:
    """Fetch job postings from lagou.com and store them through ``MysqlDb``."""

    def __init__(self):
        """Prepare the HTTP opener, the paging counters and the database handle.

        The opener keeps cookies in an in-memory jar and sends a desktop Chrome
        User-agent header.  Database settings are loaded as JSON from
        ``config.txt`` (relative path).
        """
        self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()))
        self._opener.addheaders = [('User-agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')]
        self._pageTotal = 0
        self._currentPage = 0
        with open('config.txt', 'r') as fd:
            conf = json.load(fd)
        # The config file is no longer needed once parsed, so connect after closing it.
        self._db = MysqlDb(conf['user'], conf['password'], conf['db'], conf['host'], int(conf['port'])).set_table(conf['table'])

    def get_cities(self, jobType):
        """Return the city names linked from the listing page of *jobType*.

        ``jobType`` must be a key of the ``job_attrs`` mapping defined outside
        this class; its ``'name'`` entry becomes part of the URL.  In the first
        link group the entry '全国' is skipped; the second group is appended
        unfiltered.
        """
        url = 'http://www.lagou.com/zhaopin/' + job_attrs[jobType]['name'] + '/?labelWords=label'
        # Close the HTTP response as soon as the body has been read.
        with self._opener.open(url) as page:
            html = page.read().decode('utf8')
        soup = BeautifulSoup(html, 'html.parser')
        res = []
        for city in soup.find_all('a', attrs={'data-lg-tj-cid': 'idnull', 'data-lg-tj-id': '8o00'}):
            addr = city.get_text()
            if addr != '全国':
                res.append(addr)
        for city in soup.find_all('a', attrs={'data-lg-tj-cid': 'idnull', 'data-lg-tj-id': '8q00'}):
            res.append(city.get_text())
        return res

    @staticmethod
    def _parse_bound(text, tripChar, defaultValue):
        """Return the integer that precedes *tripChar* in *text*.

        Everything from the first occurrence of the unit marker onwards is
        dropped ('15k以上' -> 15).  When what remains is not an integer,
        *defaultValue* is returned instead of raising.
        """
        number = text.split(tripChar, 1)[0] if tripChar else text
        try:
            return int(number)
        except ValueError:
            return defaultValue

    def split_str2int(self, raw, splitChar, tripChar, defaultValue = -1):
        """Parse a textual range such as '10k-20k' into an ``(low, high)`` int tuple.

        Args:
            raw: the text to parse.
            splitChar: separator between the low and the high part.
            tripChar: unit marker that follows a number ('k', '年', '人').
            defaultValue: used for a missing high part and for any part that
                is not numeric.

        Returns:
            ``(-1, -1)`` for '不限', ``(0, -1)`` for '应届毕业生', otherwise
            ``(low, high)`` where both members are ints (or *defaultValue*).
        """
        if raw == '不限':
            return (-1, -1)
        if raw == '应届毕业生':
            return (0, -1)
        print(raw)
        # partition() tolerates more than one separator, unlike a two-name
        # unpacking of split().
        low_text, separator, high_text = raw.partition(splitChar)
        low = self._parse_bound(low_text, tripChar, defaultValue)
        if separator:
            high = self._parse_bound(high_text, tripChar, defaultValue)
        else:
            # Single value such as '15k以上': there is no upper bound.
            high = defaultValue
        return (low, high)

    def translate(self, jsData):
        """Map one position record from the JSON API to a flat dict for the DB.

        ``job_type`` is looked up in the external ``job_attrs`` mapping by the
        record's ``positionType``.  Salary, work-year and company-size texts
        are split into ``*_low`` / ``*_high`` integers by ``split_str2int``.
        """
        res = {
            'job_id': jsData['positionId'],
            'job_name': jsData['positionName'],
            'job_type': job_attrs[jsData['positionType']]['type'],
            'job_first_type': 0 if jsData['positionFirstType'] == '技术' else 1,
            'education': jsData['education'],
            'company_id': jsData['companyId'],
            'company_full_name': jsData['companyName'],
            'company_short_name': jsData['companyShortName'],
            # A missing (None) label list is stored as an empty string.
            'company_labels': ','.join(jsData['companyLabelList'] or []),
            'boss_name': jsData['leaderName'],
            'industry_field': jsData['industryField'],
            'finance_stage': jsData['financeStage'],
            'job_nature': jsData['jobNature'],
            'city': jsData['city'],
            'plus': 1 if jsData['plus'] == '是' else 0,
            'create_time': jsData['createTime'],
            'advantage': jsData['positionAdvantage'],
        }
        res['salary_low'], res['salary_high'] = self.split_str2int(jsData['salary'], '-', 'k', -1)
        res['work_year_low'], res['work_year_high'] = self.split_str2int(jsData['workYear'], '-', '年', -1)
        res['staffs_low'], res['staffs_high'] = self.split_str2int(jsData['companySize'], '-', '人', -1)
        return res

    def get_jobs(self, jobType = '后端开发'):
        """Run ``get_job`` for every city returned by ``get_cities(jobType)``."""
        for city in self.get_cities(jobType):
            self.get_job(city, jobType)

    def _fetch_page(self, dataUrl, jobType, page):
        """POST the query for result page *page* and return the decoded JSON."""
        postData = {'first': 'true' if page == 1 else 'false', 'pn': page, 'kd': jobType}
        body = urllib.parse.urlencode(postData).encode('utf8')
        with self._opener.open(dataUrl, data = body) as resp:
            return json.loads(resp.read().decode('utf8'))

    def get_job(self, city, jobType):
        """Fetch every result page for *jobType* in *city* and insert each item.

        Returns:
            ``False`` as soon as a response reports ``success`` as false
            (items of earlier pages stay inserted); ``None`` after all pages
            were processed.
        """
        dataUrl = 'http://www.lagou.com/jobs/positionAjax.json?px=default&city=' + urllib.parse.quote(city)
        print(dataUrl)
        js = self._fetch_page(dataUrl, jobType, 1)
        if not js['success']:
            return False
        self._pageTotal = js['content']['totalPageCount']
        self._currentPage = 1
        while self._currentPage <= self._pageTotal:
            for item in js['content']['result']:
                print(item)
                self._db.insert(self.translate(item))
            if self._currentPage == self._pageTotal:
                # Last page handled: do not request a page that does not exist.
                break
            self._currentPage += 1
            print('*************move to next page: %d**************' % self._currentPage)
            js = self._fetch_page(dataUrl, jobType, self._currentPage)
            if not js['success']:
                return False
        print("doneeeeeeeee")