class RegPage(object):
    """Registration form: account name, password and password confirmation.

    On success the form is destroyed and the login page is shown on the
    same root window.
    """

    def __init__(self, master=None):
        """Build the form on *master* (the Tk root window)."""
        self.root = master  # keep a reference to the root window
        self.root.geometry('%dx%d' % (300, 200))  # window size
        self.username = StringVar()
        self.password = StringVar()
        self.repassword = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()

    def createPage(self):
        """Create the frame holding the three entry rows and the button."""
        self.page = Frame(self.root)
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)  # empty spacer row
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(
            row=1, column=1, stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(
            row=2, column=1, stick=E)
        Label(self.page, text='确认密码: ').grid(row=3, stick=W, pady=10)
        Entry(self.page, textvariable=self.repassword, show='*').grid(
            row=3, column=1, stick=E)
        Button(self.page, text='注册', command=self.register).grid(
            row=4, column=1, stick=E)

    def register(self):
        """Validate the form, create the account and switch to the login page.

        Shows a message box for every outcome: blank field, password
        mismatch, existing account, insert failure, or success.
        """
        name = self.username.get()
        password = self.password.get()
        repassword = self.repassword.get()

        if (name.strip() == '' or password.strip() == ''
                or repassword.strip() == ''):
            showinfo(title='注册失败', message='账户名或者密码不能为空')
            return
        if password != repassword:
            showinfo(title='错误', message='两次输入的密码不一致')
            return

        # NOTE(review): both statements interpolate user-typed text straight
        # into SQL (injection risk; a quote in the name breaks the query) and
        # the password is stored as typed. Switch to bound parameters once
        # the MysqlClient API is confirmed to support them.
        sql = "select * from user where(name='%s')" % (name)
        if self.mysqlClient.find_one(sql):
            showinfo(title='错误', message='该用户已存在')
            return

        sql = "insert into user(name,password) values ('%s','%s')" % (
            name, password)
        if not self.mysqlClient.save(sql):
            showinfo(title='注册失败', message='注册失败')
            return

        showinfo(title='注册成功', message='注册成功')
        self.page.destroy()
        # ``LoginPage`` may be bound to the class itself or to a module that
        # exposes a class of the same name; accept both instead of failing
        # with AttributeError when it is the class.
        login_cls = getattr(LoginPage, 'LoginPage', LoginPage)
        login_cls(self.root)
class LoginPage(object):
    """Login form with account/password fields and a link to registration."""

    def __init__(self, master=None):
        """Build the form on *master* (the Tk root window)."""
        self.root = master  # keep a reference to the root window
        self.root.geometry('%dx%d' % (300, 180))  # window size
        self.username = StringVar()
        self.password = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()

    def createPage(self):
        """Create the frame holding the two entry rows and both buttons."""
        self.page = Frame(self.root)
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)  # empty spacer row
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(
            row=1, column=1, stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(
            row=2, column=1, stick=E)
        Button(self.page, text='登陆', command=self.loginCheck).grid(
            row=3, stick=W, pady=10)
        Button(self.page, text='注册', command=self.register).grid(
            row=3, column=1, stick=E)

    def loginCheck(self):
        """Look the typed credentials up and open the main page on a match.

        On a match the form is destroyed, ``config.USERNAME`` is set from
        column 1 of the returned row and ``MainPage`` is built on the root
        window. Otherwise an error box is shown.
        """
        name = self.username.get()
        secret = self.password.get()
        # Two placeholders for two arguments: with a single '%s' the
        # formatting itself raised TypeError on every login attempt.
        # NOTE(review): user-typed text is interpolated straight into SQL
        # (injection risk, e.g. a quote in the name). Switch to bound
        # parameters once the MysqlClient API is confirmed to support them.
        sql = "select * from user where(name='%s' and password='%s')" % (
            name, secret)
        find_res = self.mysqlClient.find_one(sql)
        if not find_res:
            showinfo(title='错误', message='账号或密码错误!')
            return

        self.page.destroy()
        # Publish the user name first so anything MainPage reads while it is
        # being constructed already sees the logged-in account.
        config.USERNAME = find_res[1]
        MainPage(self.root)

    def register(self):
        """Replace this form with the registration page."""
        self.page.destroy()
        RegPage(self.root)
class Scheduler(object):
    """Crawler driver: seeds district/town tables, queues list-page URLs in
    Redis and stores the job postings found on them in MySQL.
    """

    # Listing pages beyond this page number are never requested.
    MAX_PAGE = 10

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        """Process one queued listing URL.

        ``get_qu``, ``get_zhen`` and ``push_url_to_redis`` are not called
        here; they were disabled in the original and have to be run by hand.
        """
        self.get_position()

    def _fetch_tree(self, url):
        """Download *url*; return the parsed tree, or None unless HTTP 200."""
        response = self.download.get_html(url)
        # Test for None first: a failed download has no ``status_code``.
        if response is None or response.status_code != 200:
            return None
        return HTML(response.text)

    @staticmethod
    def _publish_minute(publish_text):
        """Return the text after the first ':' that follows '发布于'.

        Returns None when *publish_text* lacks the marker or the colon. The
        callers treat that as "not published today" (presumably such
        postings show a date instead of a clock time).
        """
        try:
            return publish_text.split('发布于')[1].split(':')[1]
        except IndexError:
            return None

    def get_qu(self):
        """Store the district links of every city in table ``shi`` in ``qu``."""
        for res in self.db.find_all('select * from shi'):
            shi_id = res[2]  # assumes column 2 is the city code - TODO confirm
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self._fetch_tree(url)
            if html is None:
                print('该url无数据')
                continue
            qu_id_list = html.xpath(
                '//dl[@class="condition-district show-condition-district"]/dd/a/@href'
            )
            qu_name_list = html.xpath(
                '//dl[@class="condition-district show-condition-district"]/dd/a/text()'
            )
            # The first link is skipped (presumably the "all districts" entry).
            for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                qu_id = qu_id.split('/')[2]
                sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                    .format(pid=shi_id, qu_id=qu_id, name=name)
                print(sql)
                self.db.save(sql)

    def get_zhen(self):
        """Store the town links of every district in table ``qu`` in ``zhen``."""
        for res in self.db.find_all('select * from qu'):
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self._fetch_tree(url)
            if html is None:
                print('该url无数据')
                continue
            zhen_id_list = html.xpath(
                '//dl[@class="condition-area show-condition-area"]/dd/a/@href'
            )
            zhen_name_list = html.xpath(
                '//dl[@class="condition-area show-condition-area"]/dd/a/text()'
            )
            # The first link is skipped (presumably the "all towns" entry).
            for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                zhen_id = zhen_id.split('/')[2]
                sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                    .format(pid=shi_id, qu_id=qu_id, zhen_id=zhen_id, name=name)
                print(sql)
                self.db.save(sql)

    def _needs_detail(self, href_url):
        """Return True when the posting behind *href_url* must be fetched.

        A posting unknown to ``positions`` is registered by its cid right
        away so it is not picked up twice. A known row whose column 2 is
        still NULL is fetched again. Any failure is reported and skipped.
        """
        cid = None  # bound up front so the handler below can always print it
        try:
            cid = re.match(r'^/job_detail/(.*?)\.html', href_url).group(1)
            sql = "select * from positions where cid='%s'" % (cid)
            find_one_res = self.db.find_one(sql)
            if find_one_res is None:
                sql = "insert into positions(cid) values ('%s')" % (cid)
                self.db.save(sql)
                return True
            if find_one_res[2] is None:
                return True
            print('数据库存在该记录:' + str(cid))
        except Exception:
            # Best effort per item; ``Exception`` (not a bare except) keeps
            # Ctrl+C / SystemExit working.
            print('查询数据库出错:' + str(cid))
        return False

    def get_position(self):
        """Pop one listing URL from Redis and crawl its pages.

        Pages 1..MAX_PAGE are requested one after another. Paging stops as
        soon as a page cannot be downloaded or its last posting carries no
        clock time. Returns None in every case.
        """
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except (TypeError, IndexError, AttributeError, ValueError):
            # Nothing popped, or the payload is not UTF-8 encoded JSON.
            return None
        if not json_obj:
            return None

        base_url = json_obj['url']
        cityId = json_obj['cityId']
        zhiweiId = json_obj['zhiweiId']
        # The queued URL already names a page; that fragment gets swapped
        # for the page currently being crawled.
        pre_page = re.search(r'\/\?page=(.*?)&', base_url).group(1)
        queued_fragment = 'page=' + pre_page + '&sort=2&ka=page-' + pre_page

        page = 1
        while page <= self.MAX_PAGE:
            token = str(page)
            url = base_url.replace(
                queued_fragment,
                'page=' + token + '&sort=2&ka=page-' + token)
            print(url)
            html = self._fetch_tree(url)
            if html is None:
                print('该url无数据')
                # Previously the loop retried the same URL forever here.
                break

            detail_url_list = []
            for li in html.xpath('//div[@class="job-list"]/ul/li'):
                content = HTML(HT.unescape(etree.tostring(li).decode()))
                li_time = content.xpath(
                    'string(//div[@class="info-publis"]/p)')
                href_url = content.xpath(
                    'string(//div[@class="info-primary"]//h3/a/@href)')
                minute = self._publish_minute(li_time)
                if minute is None:
                    print('该URL发布日期小于当天:' + config.HOST_URL + href_url)
                elif minute and self._needs_detail(href_url):
                    detail_url_list.append(config.HOST_URL + href_url)
            self.get_detail(detail_url_list, cityId, zhiweiId)

            # Go on to the next page only while the last posting of this
            # page still shows a clock time.
            last_li = html.xpath(
                'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)'
            )
            if not self._publish_minute(last_li):
                break
            page += 1

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        """Download each detail page and upsert today's postings.

        Pages that fail to download, whose cid cannot be extracted, or whose
        publish time is missing or outside the current local day are
        reported and skipped.
        """
        for url in detail_url_list:
            print('下载该详情页:' + url)
            html = self._fetch_tree(url)
            if html is None:
                print('请求详情页失败:' + str(url))
                continue

            try:
                cid = re.match(
                    r'^https://www.zhipin.com/job_detail/(.*?)\.html',
                    url).group(1)
            except AttributeError:  # the URL did not match
                print('获取cid失败')
                continue

            title = html.xpath('string(//h1)')
            try:
                publishDateStr = html.xpath(
                    'string(//span[@class="time"])').split('发布于')[1]
                publishDate = int(
                    time.mktime(
                        time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
            except (IndexError, ValueError, OverflowError):
                publishDateStr = None
                publishDate = None

            try:
                info = html.xpath(
                    'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                ).split(':')
                # Each segment but the last ends with the 2-character label
                # of the next field, hence the [:-2] (presumed page layout).
                city = info[1][:-2]
                jingyan = info[2][:-2]
                xueli = info[3]
            except IndexError:
                city = None
                jingyan = None
                xueli = None

            price = html.xpath(
                'string(//div[@class="info-primary"]//span[@class="badge"])'
            )
            posterName = html.xpath('string(//h2)')
            posterId = None
            posterUrl = html.xpath(
                'string(//div[@class="detail-figure"]/img/@src)')
            content = html.xpath(
                'string(//div[@class="job-sec"]/div[@class="text"])'
            ).strip()
            try:
                company_text = html.xpath(
                    'string(//a[@ka="job-cominfo"]/@href)')
                companyID = re.match(r'/gongsi/(.*?)\.html',
                                     company_text).group(1)
            except AttributeError:  # the link did not match
                companyID = None
            createDate = int(time.time())

            # Keep the posting only if it was published during the current
            # local day: [midnight, midnight + 24h).
            now_DateStr = time.strftime("%Y-%m-%d", time.localtime(createDate))
            now_timestamp = int(
                time.mktime(time.strptime(now_DateStr, "%Y-%m-%d")))
            if (publishDate is None or publishDate < now_timestamp
                    or publishDate >= now_timestamp + 86400):
                print('特例.该url不是当天发布:' + str(url))
                continue

            res_obj = {
                'cid': cid,
                'title': title,
                'url': url,
                'publishDateStr': publishDateStr,
                'publishDate': publishDate,
                'city': city,
                'jingyan': jingyan,
                'xueli': xueli,
                'price': price,
                'posterName': posterName,
                'posterId': posterId,
                'posterUrl': posterUrl,
                'content': content,
                'companyID': companyID,
                'createDate': createDate,
                'cityId': cityId,
                'zhiweiId': zhiweiId
            }
            print(res_obj)

            # NOTE(review): scraped text is interpolated straight into SQL;
            # a quote inside e.g. ``content`` breaks the statement and None
            # is stored as the text 'None'. Switch to bound parameters once
            # the MysqlClient API is confirmed to support them.
            row = (cid, title, url, publishDate, publishDateStr, city,
                   jingyan, xueli, price, posterName, posterId, posterUrl,
                   content, companyID, createDate, cityId, zhiweiId)
            sql = (
                "insert into positions(cid,title,url,publishDate,publishDateStr,"
                "city,jingyan,xueli,price,posterName,posterId,posterUrl,content,"
                "companyID,createDate,cityId, zhiweiId)"
                " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', "
                "'%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % row
                # The update clause takes the same values minus the cid key.
                + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', "
                "city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', "
                "posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'"
                % row[1:]
            )
            self.db.save(sql)

    def push_url_to_redis(self):
        """Queue one first-page listing URL per (city, job category) pair."""
        zhiwei_list = [
            zhiwei[2] for zhiwei in self.db.find_all('select * from zhiwei')
        ]
        for res in self.db.find_all('select * from shi'):
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid,
                                              zhiwei_id=zhiwei_id,
                                              pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
class InputFrame(Frame):  # subclass of Frame
    """Dictionary tab: look a word up, show the result, record or bookmark it."""

    def __init__(self, master=None):
        Frame.__init__(self, master)
        self.root = master  # keep a reference to the root window
        self.word = StringVar()
        self.mean = StringVar()
        # The three result labels, created on the first createPage() call.
        self._result_labels = None
        self.createPage()
        self.mysqlClient = MysqlClient()
        self.spider = spider.Spider()

    def createPage(self, query_res=None):
        """Build the widgets on the first call, refresh the result afterwards.

        *query_res* is a mapping with the keys ``fanyi``, ``phonetic`` and
        ``translation``; when omitted all three are shown empty.
        """
        if query_res is None:
            query_res = {'fanyi': '', 'phonetic': '', 'translation': ''}
        texts = (
            '翻译:' + query_res['fanyi'],
            '发音:' + query_res['phonetic'],
            '其他:' + query_res['translation'],
        )

        if self._result_labels is not None:
            # Update in place: building a new widget set per query piled
            # fresh widgets on top of the old ones, which were never freed.
            for label, text in zip(self._result_labels, texts):
                label.config(text=text)
            return

        Label(self).grid(row=0, stick=W, pady=10)  # empty spacer row
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.word, width=40).grid(row=2, stick=W)
        Label(self, text='结果如下: ').grid(row=4, stick=W, pady=10)
        labels = []
        for row, text in enumerate(texts, start=5):  # rows 5, 6 and 7
            label = Label(self, text=text, height=2, width=40, justify='left')
            label.grid(row=row, stick=W, pady=10)
            labels.append(label)
        self._result_labels = labels
        Button(self, text='查询', command=self.query_word).grid(
            row=10, column=1, stick=E, pady=10)
        Button(self, text='收藏', command=self.collect_word).grid(
            row=10, column=2, stick=E, pady=10)

    def _lookup(self, word):
        """Return the result mapping for *word*, or a falsy value on failure.

        The local ``words`` table is tried first (English column, then
        Chinese column); only when both miss is the spider asked.
        """
        # NOTE(review): user-typed text is interpolated straight into SQL
        # (injection risk; '%' and '_' also act as LIKE wildcards). Switch to
        # bound parameters once the MysqlClient API is confirmed.
        sql = "select * from words where en_word like '%s'" % (word)
        find_res = self.mysqlClient.find_one(sql)
        if find_res:
            return {'fanyi': find_res[2], 'phonetic': '', 'translation': ''}

        sql = "select * from words where cn_word like '%s'" % (word)
        find_res = self.mysqlClient.find_one(sql)
        if find_res:
            return {'fanyi': find_res[1], 'phonetic': '', 'translation': ''}

        # '1' marks English input, '0' Chinese (first character in the
        # U+4E00..U+9FFF range).
        eng = '0' if '\u4e00' <= word[0] <= '\u9fff' else '1'
        return self.spider.get_dic(word, eng=eng)

    def _query_and_show(self):
        """Look the typed word up and display it.

        Returns ``(word, result)``; ``result`` is None when the input is
        empty or the lookup failed (the failure dialog has then been shown).
        """
        word = self.word.get()
        if not word:
            # Nothing typed: ``word[0]`` used to raise IndexError here.
            return word, None
        query_res = self._lookup(word)
        if not query_res:
            showinfo(title='查询失败', message='查询失败,请检查您的网络')
            # Stop here: there is no result to display or store.
            return word, None
        self.createPage(query_res)
        return word, query_res

    def _save_record(self, table, word, mean):
        """Insert (word, mean, current user, now) into *table*; return the
        result of ``MysqlClient.save``.
        """
        name = config.USERNAME
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into " + table + "(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        return self.mysqlClient.save(sql)

    def query_word(self):
        """Look the typed word up, show it and add it to the query history."""
        word, query_res = self._query_and_show()
        if query_res is None:
            return
        self._save_record('history', word, json.dumps(query_res['fanyi']))

    def collect_word(self):
        """Look the typed word up, show it and add it to the user's bookmarks."""
        word, query_res = self._query_and_show()
        if query_res is None:
            return
        if self._save_record('collection', word, str(query_res['fanyi'])):
            showinfo(title='成功', message='收藏成功')
        else:
            showinfo(title='失败', message='收藏失败!')