def start(self):
    """Rebuild the ``pmphmooc_open`` table from the JSON course listing.

    Drops and recreates the table, then inserts one row for every entry in
    the ``rows`` array of the JSON text returned by ``self.getJson()``.
    """
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists pmphmooc_open')
    cur.execute('create table pmphmooc_open(id int(11) primary key auto_increment,title varchar(255),school varchar(255),teacher varchar(255),touxian varchar(255),resume text,hitcount varchar(255),url varchar(255))')
    sql = 'insert into pmphmooc_open(title,school,teacher,touxian,resume,hitcount,url) values(%s,%s,%s,%s,%s,%s,%s)'
    content = json.loads(self.getJson())
    for item in content["rows"]:
        oneline = Item()
        oneline.title = item["name"]
        oneline.url = 'http://www.pmphmooc.com/web/opencoursedetail?courseid=' + str(item["id"])
        oneline.school = item["agencyname"]
        oneline.hitcount = item["hitcount"]
        # These keys are optional in the feed. ``in`` replaces dict.has_key(),
        # which was removed in Python 3 and behaves the same on Python 2.
        if "username" in item:
            oneline.teacher = item["username"]
        if "touxian" in item:
            oneline.touxian = item["touxian"]
        if "resume" in item:
            oneline.resume = item["resume"]
        # Column order must match the placeholders in ``sql``.
        value = [
            oneline.title,
            oneline.school,
            oneline.teacher,
            oneline.touxian,
            oneline.resume,
            oneline.hitcount,
            oneline.url,
        ]
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild the ``mooccollege`` table from listing pages 1 to 4.

    Each listing page corresponds to one course category; every entry
    returned by ``self.getInfo`` for a page is stored with that category.
    """
    # Listing page number -> category label; any other page (here: 4) gets
    # the fallback label passed to .get() below.
    type_by_page = {1: "冲刺专题", 2: "考题解析", 3: "同步教材"}

    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists mooccollege')
    cur.execute('create table mooccollege(id int(11) primary key auto_increment,title varchar(255),teacher varchar(255),school varchar(255),type varchar(255))')
    sql = 'insert into mooccollege(title,teacher,school,type) values(%s,%s,%s,%s)'
    for page_no in range(1, 5):
        oneline = Item()
        listing = self.getPage(page_no)
        for entry in self.getInfo(listing):
            oneline.title = entry[0]
            oneline.teacher = entry[1]
            oneline.school = entry[2]
            oneline.type = type_by_page.get(page_no, "知识模块")
            MysqlHelper.insert_one(
                cur,
                sql,
                [oneline.title, oneline.teacher, oneline.school, oneline.type],
            )
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild the ``pmphmooc_open`` table from the JSON course listing.

    Drops and recreates the table, then inserts one row for every entry in
    the ``rows`` array of the JSON text returned by ``self.getJson()``.
    """
    # JSON key -> Item attribute for the fields that may be absent from a row.
    optional_fields = (
        ("username", "teacher"),
        ("touxian", "touxian"),
        ("resume", "resume"),
    )

    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists pmphmooc_open')
    cur.execute(
        'create table pmphmooc_open(id int(11) primary key auto_increment,title varchar(255),school varchar(255),teacher varchar(255),touxian varchar(255),resume text,hitcount varchar(255),url varchar(255))'
    )
    sql = 'insert into pmphmooc_open(title,school,teacher,touxian,resume,hitcount,url) values(%s,%s,%s,%s,%s,%s,%s)'
    content = json.loads(self.getJson())
    for item in content["rows"]:
        oneline = Item()
        oneline.title = item["name"]
        oneline.url = 'http://www.pmphmooc.com/web/opencoursedetail?courseid=' + str(
            item["id"])
        oneline.school = item["agencyname"]
        oneline.hitcount = item["hitcount"]
        for key, attribute in optional_fields:
            # ``in`` replaces dict.has_key(), which was removed in Python 3
            # and behaves the same on Python 2.
            if key in item:
                setattr(oneline, attribute, item[key])
        # Column order must match the placeholders in ``sql``.
        value = [
            oneline.title,
            oneline.school,
            oneline.teacher,
            oneline.touxian,
            oneline.resume,
            oneline.hitcount,
            oneline.url,
        ]
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild the ``fudan`` table from the course list JSON.

    Every entry of ``content["course"]`` whose ``about`` path does not
    contain "lecture" is fetched and scraped into one row.
    """
    content = json.loads(self.getContent())
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists fudan')
    cur.execute('create table if not exists fudan(id int(11) primary key auto_increment,title varchar(255),lesson_code varchar(255),start_time varchar(255),current_sem varchar(255),spend_time varchar(255),short_desc text,knowledge_res text,chapter_info text,common_prob text,teacher_info text,url varchar(255))')
    sql = 'insert into fudan(title,lesson_code,start_time,current_sem,spend_time,short_desc,knowledge_res,chapter_info,common_prob,teacher_info,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    for course in content["course"]:
        about = course["about"]
        if "lecture" in about:
            continue
        url = "http://fudan.xuetangx.com" + about
        page = self.getPage(url)
        value = [self.getTitle(page)]
        for field in self.getInfo1(page):
            value.append(field[0] + ':' + self.tool.replace(field[1]))
        info2 = self.getInfo2(page)
        for field in info2:
            cleaned = re.sub(r'[\n\t]+', r'\n', self.tool.replace(field[1]), flags=re.S)
            value.append(field[0] + ':' + cleaned)
        # Pad with blanks so the second info group always fills four columns.
        value.extend([''] * (4 - len(info2)))
        teacher = ""
        for person in self.getTeacherInfo(page):
            teacher += person[0] + '\n' + person[1] + '\n' + self.tool.replace(person[2]) + '\n'
        value.append(teacher)
        value.append(url)
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild ``computer_icourses`` from the courses linked on the index page.

    Fetches http://computer.icourses.cn/, recreates the table, then for each
    URL returned by ``self.getURL`` scrapes the title, short description,
    labelled info sections and teacher list, and inserts one row.
    """
    # Section heading on the course page -> Item attribute that stores it.
    section_fields = (
        ('课程概述', 'description'),
        ('证书要求', 'requirement'),
        ('预备知识', 'pre_knowledge'),
        ('授课大纲', 'chapter'),
        ('参考资料', 'reference'),
        ('常见问题', 'common_prob'),
    )
    # Item attributes in the order of the placeholders in ``sql``.
    columns = (
        'title', 'short_desc', 'description', 'requirement', 'pre_knowledge',
        'chapter', 'reference', 'common_prob', 'teacher', 'url',
    )

    indexPage = self.getPage('http://computer.icourses.cn/')
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists computer_icourses')
    cur.execute(
        'create table computer_icourses(id int(11) primary key auto_increment,title varchar(255),short_desc text,description text,requirement text,pre_knowledge text,chapter text,reference text,common_prob text,teacher text,url varchar(255))'
    )
    sql = 'insert into computer_icourses(title,short_desc,description,requirement,pre_knowledge,chapter,reference,common_prob,teacher,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    for course_url in self.getURL(indexPage):
        oneline = Item()
        oneline.url = course_url
        page = self.getPage(course_url)
        oneline.title = self.getTitle(page)
        oneline.short_desc = self.getShortDesc(page)
        for section in self.getInfo(page):
            for heading, attribute in section_fields:
                if section[0] == heading:
                    setattr(
                        oneline,
                        attribute,
                        re.sub(self.tool.replaceNBSP, " ", self.tool.replace(section[1])),
                    )
        # One teacher per line, each followed by a newline.
        oneline.teacher = "".join(name + '\n' for name in self.getTeacher(page))
        value = [getattr(oneline, column) for column in columns]
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def export_corpus():
    """Write the ``Content`` column of two tables to text files, one row per line.

    The first table goes to ``data/p**n.txt`` and ``unporn`` goes to
    ``data/unporn.txt``.
    """
    mysql_helper = MysqlHelper('193.168.15.136', 'test', 'test', 'p**n', 'utf8')

    # Characters that cannot be encoded are dropped for this file only.
    with open('data/p**n.txt', 'w', encoding='utf-8', errors='ignore') as fporn_write:
        result_porn = mysql_helper.exeQuery('select Content from p**n')
        fporn_write.writelines('{}\n'.format(row[0]) for row in result_porn._rows)

    with open('data/unporn.txt', 'w', encoding='utf-8') as funporn_write:
        result_unporn = mysql_helper.exeQuery('select Content from unporn')
        funporn_write.writelines('{}\n'.format(row[0]) for row in result_unporn._rows)
class CheckLogin():
    """Console login / registration flow backed by the ``user_passwd`` table."""

    def __init__(self):
        self.mysql_helper = MysqlHelper(host="localhost", user="******", passwd="123456", port=3306, db='python')

    def get_message(self):
        """Prompt for a username and password and store the password's SHA-1 hex digest."""
        self.username = input("用户名:")
        self.password = input("密码:")
        # NOTE(review): unsalted SHA-1 is weak for password storage. It is
        # kept here because changing it would stop matching stored digests.
        self.hash_password = hashlib.sha1(self.password.encode("utf-8")).hexdigest()

    def judge_user(self):
        """Look up the stored digest for ``self.username`` into ``self.result``."""
        sql = "select passwd from user_passwd where username=%s"
        params = [self.username]
        self.result = self.mysql_helper.find_get(sql, params)

    def login(self):
        """Ask for credentials and report success, wrong password or unknown user."""
        self.get_message()
        # user_passwd
        self.judge_user()
        # Truthiness instead of ``== ()``: an empty list or None from the
        # helper must also count as "no such user", otherwise the indexing
        # below fails on an empty result.
        if not self.result:
            print("\n用户不存在,请注册")
            register = input("是否注册[Y/N]:")
            if register.lower() == "y":
                self.register()
            else:
                print("成功退出")
            return
        if self.result[0][0] == self.hash_password:
            print("\n*******登录成功********")
        else:
            print("\n密码错误")

    def register(self):
        """Ask for credentials and insert a new user unless the name is taken."""
        self.get_message()
        self.judge_user()
        if not self.result:
            sql = "insert into user_passwd values(0,%s,%s);"
            params = [self.username, self.hash_password]
            self.mysql_helper.cud(sql, params)
            return
        print("用户已存在,请登录")
        login = input("是否登录[Y/N]:")
        if login.lower() == "y":
            self.login()
        else:
            print("成功退出")
def getCourceInfo(self):
    """Rebuild the ``jisuanke`` table from the course pages under ``self.url``.

    For each course URL: title, duration, learner count, introduction and a
    flattened outline are scraped and inserted as one row.
    """
    li_pattern = re.compile(r'<li>(.*?)</li>', re.S)

    cource_url_list = self.getCourceUrl(self.url)
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists jisuanke')
    cur.execute('create table jisuanke(id int(11) primary key auto_increment,title varchar(255),time varchar(255),learn_count varchar(255),short_desc text,outline text)')
    sql = 'insert into jisuanke(title,time,learn_count,short_desc,outline) values(%s,%s,%s,%s,%s)'
    for pageurl in cource_url_list:
        # The scraped links lack a scheme, so "http:" is prepended.
        page = self.getPageInfo("http:" + pageurl)
        value = [
            self.removeTab(self.getCourceName(page)[0]),
            self.removeTab(self.getCourceTime(page)[0]),
            self.getPeopleNum(page)[0],
            self.getClassInfo(page),
        ]
        # Outline: "<chapter>:<name>\n" followed by the chapter's <li> texts.
        outline_parts = []
        courseInfo = self.getInfo(page)
        if courseInfo:
            for chapter in courseInfo:
                outline_parts.append(chapter[0] + ':' + chapter[1] + '\n')
                outline_parts.extend(li_pattern.findall(chapter[2]))
        value.append("".join(outline_parts))
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def insert_data(porn_file, unporn_file):
    """Load two text files into their tables, skipping lines already stored.

    Each line is right-stripped; when ``find`` reports 0 for that content, a
    row is inserted whose ID is the MD5 hex digest of the stripped line.
    """
    mysql_helper = MysqlHelper('193.168.15.136', 'test', 'test', 'p**n', 'utf8')

    def load(handle, table):
        # Same lookup-then-insert loop for both tables.
        sql_query = "select * from {} where Content= %s".format(table)
        sql_insert = 'insert into {}(ID,Content) values (%s,%s)'.format(table)
        for line in handle:
            text = line.rstrip()
            if mysql_helper.find(sql_query, text) == 0:
                digest = hashlib.md5(text.encode(encoding='utf-8')).hexdigest()
                mysql_helper.cud(sql_insert, (digest, text))

    # Undecodable bytes are ignored for the first file only.
    with open(porn_file, 'r', encoding='utf-8', errors='ignore') as fporn_read:
        load(fporn_read, 'p**n')
    with open(unporn_file, 'r', encoding='utf-8') as funporn_read:
        load(funporn_read, 'unporn')
def start(self):
    """Rebuild ``computer_icourses`` from the courses linked on the index page.

    Fetches http://computer.icourses.cn/, recreates the table, then for each
    URL returned by ``self.getURL`` scrapes the title, short description,
    labelled info sections and teacher list, and inserts one row.
    """
    # Section heading on the course page -> Item attribute that stores it.
    heading_to_attribute = (
        ('课程概述', 'description'),
        ('证书要求', 'requirement'),
        ('预备知识', 'pre_knowledge'),
        ('授课大纲', 'chapter'),
        ('参考资料', 'reference'),
        ('常见问题', 'common_prob'),
    )

    indexPage = self.getPage('http://computer.icourses.cn/')
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists computer_icourses')
    cur.execute('create table computer_icourses(id int(11) primary key auto_increment,title varchar(255),short_desc text,description text,requirement text,pre_knowledge text,chapter text,reference text,common_prob text,teacher text,url varchar(255))')
    sql = 'insert into computer_icourses(title,short_desc,description,requirement,pre_knowledge,chapter,reference,common_prob,teacher,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    course_urls = self.getURL(indexPage)
    for course_url in course_urls:
        oneline = Item()
        oneline.url = course_url
        page = self.getPage(course_url)
        oneline.title = self.getTitle(page)
        oneline.short_desc = self.getShortDesc(page)
        for section in self.getInfo(page):
            for heading, attribute in heading_to_attribute:
                if section[0] == heading:
                    text = re.sub(self.tool.replaceNBSP, " ", self.tool.replace(section[1]))
                    setattr(oneline, attribute, text)
        teacher_lines = [name + '\n' for name in self.getTeacher(page)]
        oneline.teacher = "".join(teacher_lines)
        # Order must match the placeholders in ``sql``.
        value = [
            oneline.title,
            oneline.short_desc,
            oneline.description,
            oneline.requirement,
            oneline.pre_knowledge,
            oneline.chapter,
            oneline.reference,
            oneline.common_prob,
            oneline.teacher,
            oneline.url,
        ]
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild the ``pmphmooc`` table from the JSON course listing.

    For each entry in ``rows`` the course page is fetched by id and its
    description, dates, workload, teachers, content block and chapter text
    are scraped into one row.
    """
    # Item attributes in the order of the placeholders in ``sql``.
    columns = (
        'title', 'description', 'chapter', 'course_begin', 'course_end',
        'course_totaltime', 'course_load', 'teacher', 'block', 'url',
    )

    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists pmphmooc')
    cur.execute('create table pmphmooc(id int(11) primary key auto_increment,title varchar(255),description text,chapter text,course_begin varchar(255),course_end varchar(255),course_totaltime varchar(255),course_load varchar(255),teacher text,block text,url varchar(255))')
    sql = 'insert into pmphmooc(title,description,chapter,course_begin,course_end,course_totaltime,course_load,teacher,block,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    content = json.loads(self.getJson())
    for row in content["rows"]:
        oneline = Item()
        oneline.title = row["name"]
        oneline.url = 'http://www.pmphmooc.com/web/scholl/' + str(row["id"])
        page = self.getPage(row["id"])
        oneline.description = self.tool.replace(self.getDescription(page))
        # If several (begin, end) pairs are found, the last one wins.
        for span in self.getBeginAndEnd(page):
            oneline.course_begin = span[0]
            oneline.course_end = span[1]
        totalAndLoad = self.getTotalAndLoad(page)
        oneline.course_totaltime = totalAndLoad[0]
        oneline.course_load = totalAndLoad[1]
        oneline.teacher = "".join(
            person[0] + '\n' + person[1] + '\n' for person in self.getTeacher(page)
        )
        block = self.getBlock(page)
        oneline.block = block
        chapter = self.getText(block, ["授课大纲", "课程章节"])
        if chapter:
            oneline.chapter = re.sub(self.tool.replaceNBSP, "", self.tool.replace(chapter))
        value = [getattr(oneline, column) for column in columns]
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def loadFoodListComm(sql, type):
    """Crawl the not-yet-visited listing pages of every category row.

    ``sql`` is executed with the single parameter 2 and must yield rows of
    (id, parent_id, url, total_page, current_page). ``type == 1`` selects
    the food list crawler; any other value selects the material crawler.
    """
    params = (2,)
    print(sql % params)
    rows = MysqlHelper.MysqlHelper().fetchall(sql, params)
    for class_id, parent_id, url, total_page, current_page in rows:
        # Append to an existing query string, or start a new one.
        page_param = "&page=" if "?" in url else "?&page="
        for page in range(current_page + 1, total_page + 1):
            page_url = url + page_param + str(page)
            # Crawl the data of this page
            if type == 1:
                loadFoodListPage(page_url, class_id, parent_id)
            else:
                loadFoodMaterialListPage(page_url, class_id, parent_id)
            # Record the page number just crawled
            updateCurrentPage(class_id, page)
            sleepRandom()
def anaylysiscount():
    """Print the total row count of ``maoyan`` and the counts for three countries.

    The country counts select rows whose ``time`` column contains the
    country name. Output order is: total, USA, Japan, China.
    """
    dbhelper = MysqlHelper.DbHelper()
    total = dbhelper.fetchCount("select count(*) from maoyan")
    # Identifiers are quoted with backticks: single quotes make MySQL read
    # them as string literals, which is a syntax error after FROM. A space
    # is also required before ``where``.
    am = dbhelper.fetchCount("select count(*) from `newdatabase`.`maoyan` where time like '%美国%'")
    china = dbhelper.fetchCount("select count(*) from `newdatabase`.`maoyan` where time like '%中国%'")
    japan = dbhelper.fetchCount("select count(*) from `newdatabase`.`maoyan` where time like '%日本%'")
    print(total, am, japan, china)
def select_many(self, sql, params=None):
    """Run a multi-row query on the ``self.r_config`` connection.

    Returns whatever ``query_many`` returns, or None when any exception is
    raised (the error text is printed).
    """
    try:
        return MysqlHelper(self.r_config).query_many(sql, params)
    except Exception as e:
        print('Error:' + str(e))
        return None
def loadFoodListPage(url, class1_id, class2_id):
    """Scrape one food listing page and insert every entry into ``lb_food``."""
    comment_pattern = re.compile(r'(\d*)评论(\d*)人气')
    step_pattern = re.compile(r'(\d*)步')
    sql = ("insert into lb_food ( name , class1_id , class2_id , comment_num , popularity_num ,"
           " step_num , html_url , thumbnail_url) values ( %s, %s, %s, %s, %s, %s, %s, %s )")

    for anchor in getContent(url).xpath('//div[@class="listtyle1"]/a'):
        html_url = anchor.xpath("./@href")[0]
        title = anchor.xpath("./@title")[0]
        thumbnail_url = anchor.xpath("img/@src")[0]

        # Counters stay 0 unless a span/li text matches; the last match wins.
        comment_num = 0
        popularity_num = 0
        for span_text in anchor.xpath("div//span/text()"):
            found = comment_pattern.match(span_text.replace(" ", "").encode("utf-8"))
            if found is not None:
                comment_num, popularity_num = found.group(1), found.group(2)

        step_num = 0
        for li_text in anchor.xpath("div//li[@class='li1']/text()"):
            found = step_pattern.match(li_text.replace(" ", "").encode("utf-8"))
            if found is not None:
                step_num = found.group(1)

        params = (
            title, class1_id, class2_id, comment_num, popularity_num, step_num, html_url,
            thumbnail_url)
        MysqlHelper.MysqlHelper().cud(sql, params)
def __init__(self, model, thread_size, gpu_card):
    """Read DB settings from ../conf/load_config.conf and create the MySQL helper.

    The database name comes from the ``conf_<model>`` section, formatted
    with ``thread_size``; if that fails, the ``db`` section's ``test_db`` is
    used. Any other error is printed and leaves the object partly set up.
    """
    self.config = {}
    try:
        cf = ConfigParser.ConfigParser()
        cf.read("../conf/load_config.conf")

        def db_option(name):
            return cf.get("db", name)

        self.mysql_host = db_option("mysql_host")
        self.mysql_port = cf.getint("db", "mysql_port")
        self.mysql_user = db_option("mysql_user")
        self.mysql_passwd = db_option("mysql_passwd")
        #TODO
        conf_name = "conf_%s" % model
        try:
            self.test_db = cf.get(conf_name, "test_db") % thread_size
        except Exception:
            self.test_db = db_option("test_db")
        self.mysql = mysql_helper.MysqlHelper(
            host=self.mysql_host,
            port=self.mysql_port,
            user=self.mysql_user,
            passwd=self.mysql_passwd,
            db=self.test_db,
        )
        self.gpu_card = gpu_card
    except Exception as exception:
        print(exception)
def __init__(self, model, batch_size, gpu_card):
    """Read DB settings from ../conf/load_config.conf and create the MySQL helper.

    The database name comes from the ``conf_<model>`` section, formatted
    with ``batch_size``; if that lookup fails the process exits with status
    1. Any other error is printed and leaves the object partly set up.
    """
    self.config = {}
    try:
        cf = ConfigParser.ConfigParser()
        cf.read("../conf/load_config.conf")

        def db_option(name):
            return cf.get("db", name)

        self.mysql_host = db_option("mysql_host")
        self.mysql_port = cf.getint("db", "mysql_port")
        self.mysql_user = db_option("mysql_user")
        self.mysql_passwd = db_option("mysql_passwd")
        #TODO
        conf_name = "conf_%s" % model
        try:
            self.test_db = cf.get(conf_name, "test_db") % batch_size
        except Exception:
            print(
                "\033[0;31;m[error]: Pls Check The Modle input wrong!\033[0m"
            )
            # SystemExit is not an Exception subclass, so the outer handler
            # does not intercept it.
            sys.exit(1)
        self.mysql = mysql_helper.MysqlHelper(
            host=self.mysql_host,
            port=self.mysql_port,
            user=self.mysql_user,
            passwd=self.mysql_passwd,
            db=self.test_db,
        )
        self.gpu_card = gpu_card
    except Exception as exception:
        print(exception)
def spiderProcess(food_id, content):
    """Insert every cooking step found in ``content`` into ``lb_cook_process``.

    Returns True when each step's insert reported exactly one affected row.
    """
    def first_or_blank(nodes):
        # First XPath hit, or "" when there is none.
        return nodes[0] if len(nodes) > 0 else ""

    sql = ("insert into lb_cook_process (food_id, step, content, img_url)"
           " values ( %s, %s, %s, %s )")
    divStepList = content.xpath("//div[@class='editnew edit']/div[@class='content clearfix']")
    inserted = 0
    for div in divStepList:
        step = first_or_blank(div.xpath("em/text()"))
        step_text = first_or_blank(div.xpath("div/p/text()"))
        img_url = first_or_blank(div.xpath("div/p/img/@src"))
        params = (food_id, step, step_text, img_url)
        if MysqlHelper.MysqlHelper(isTest).cud(sql, params) == 1:
            inserted += 1
    return inserted == len(divStepList)
def execute_many(self, sql, params=None):
    """Run a batch statement on the ``self.w_config`` connection.

    Returns True when ``execute_many`` reports a positive number, False
    otherwise or when any exception is raised (the error text is printed).
    """
    try:
        affected = MysqlHelper(self.w_config).execute_many(sql, params)
        return affected > 0
    except Exception as e:
        print('Error:' + str(e))
        return False
def updateSpiderDetail(error_msg, food_id, is_spider_detail_OK=False):
    """Store the detail-crawl outcome of one ``lb_food`` row.

    ``is_spider_detail`` is written as 1 when ``is_spider_detail_OK`` is
    truthy and 2 otherwise, together with ``error_msg``.
    """
    is_spider_detail = 1 if is_spider_detail_OK else 2
    sql = "update lb_food set is_spider_detail = %s, error_msg = %s where id = %s "
    params = (is_spider_detail, error_msg, food_id)
    print(sql % params)
    MysqlHelper.MysqlHelper(isTest).cud(sql, params)
def start(self):
    """Rebuild the ``imooc`` table by walking every listing page.

    The page count is read from listing page 1; for each course id on each
    page the learn and view pages are scraped into one row.
    """
    li_pattern = re.compile('<li>(.*?)</li>', re.S)

    indexPage = self.getContent(1)
    pageNum = self.getPageNum(indexPage)
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists imooc')
    cur.execute(
        'create table imooc(id int(11) primary key auto_increment,title varchar(255),difficulty varchar(255),time varchar(255),learn_count varchar(255),short_desc text,outline text)'
    )
    sql = 'insert into imooc(title,difficulty,time,learn_count,short_desc,outline) values(%s,%s,%s,%s,%s,%s)'
    for page_no in range(1, int(pageNum) + 1):
        indexPage = self.getContent(page_no)
        for view_id in self.getViewsId(indexPage):
            learnpage = self.getLearnPage(view_id)
            viewpage = self.getViewPage(view_id)
            value = [self.getTitle(learnpage)]

            # Difficulty, duration and learner count, in that column order.
            infos = [self.tool.replace(raw) for raw in self.getLevelTimeAndCount(learnpage)]
            value.extend((infos[0], infos[1], infos[2]))
            value.append(self.getBrief(viewpage))

            # Outline: each chapter title on its own line, followed by its
            # cleaned <li> entries, one per line.
            outline_parts = []
            for chapter in self.getOutline(learnpage):
                outline_parts.append(self.tool.replace(chapter[0]) + '\n')
                for entry in li_pattern.findall(chapter[1]):
                    entry = re.sub(self.tool.removeAddr, "", entry)
                    entry = re.sub(self.tool.replaceLT, "<", entry)
                    entry = re.sub(self.tool.replaceGT, ">", entry)
                    outline_parts.append(entry.strip() + '\n')
            value.append("".join(outline_parts))
            MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def start(self):
    """Rebuild the ``imooc`` table by walking every listing page.

    The page count is read from listing page 1; for each course id on each
    page the learn and view pages are scraped into one row.
    """
    li_pattern = re.compile('<li>(.*?)</li>', re.S)

    def clean_entry(text):
        # Strip links, then restore the escaped angle brackets.
        text = re.sub(self.tool.removeAddr, "", text)
        text = re.sub(self.tool.replaceLT, "<", text)
        return re.sub(self.tool.replaceGT, ">", text)

    indexPage = self.getContent(1)
    pageNum = self.getPageNum(indexPage)
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists imooc')
    cur.execute('create table imooc(id int(11) primary key auto_increment,title varchar(255),difficulty varchar(255),time varchar(255),learn_count varchar(255),short_desc text,outline text)')
    sql = 'insert into imooc(title,difficulty,time,learn_count,short_desc,outline) values(%s,%s,%s,%s,%s,%s)'
    last_page = int(pageNum)
    for page_no in range(1, last_page + 1):
        indexPage = self.getContent(page_no)
        for view_id in self.getViewsId(indexPage):
            learnpage = self.getLearnPage(view_id)
            viewpage = self.getViewPage(view_id)
            title = self.getTitle(learnpage)
            infos = [self.tool.replace(raw) for raw in self.getLevelTimeAndCount(learnpage)]
            # Exactly three info fields are stored: difficulty, time, count.
            value = [title, infos[0], infos[1], infos[2]]
            value.append(self.getBrief(viewpage))

            outline_text = ""
            for chapter in self.getOutline(learnpage):
                outline_text += self.tool.replace(chapter[0]) + '\n'
                for entry in li_pattern.findall(chapter[1]):
                    outline_text += clean_entry(entry).strip() + '\n'
            value.append(outline_text)
            MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def getCpuAndMemory(self):
    """Aggregate CPU / VSS / RSS samples for one package and store them.

    Lines of ``self.file`` whose 10th whitespace-separated field equals
    ``self.packageName`` are copied to ``self.infofile`` (which is then
    closed). ``self.saveFileName`` is re-read, fields 3, 6 and 7 are parsed
    as integers after dropping their last character, and the max/average
    values are written to the row of table ``self.appName`` whose
    ``devicemodel`` equals ``self.deviceModel``.

    Raises:
        ValueError: if no sample for the package was found.
    """
    packageName = self.packageName
    saveFileName = self.saveFileName
    infofile = self.infofile

    # Keep only the samples that belong to the package under test.
    for line in iter(self.file.readline, ''):
        fields = line.split()
        # Blank or short lines have no 10th field; skip them instead of
        # failing with IndexError.
        if len(fields) > 9 and fields[9] == packageName:
            infofile.write(line)
    infofile.close()

    list_cpu = []
    list_vss = []
    list_rss = []
    # ``with`` closes the file even when a field fails to parse.
    with open(saveFileName) as resultFile:
        for resultLine in resultFile:
            fields = resultLine.split()
            if len(fields) < 7:
                continue
            # The last character of each field is a unit suffix — presumably
            # '%' or 'K'; confirm against the capture format.
            list_cpu.append(int(fields[2][:-1]))
            list_vss.append(int(fields[5][:-1]))
            list_rss.append(int(fields[6][:-1]))

    if not list_cpu:
        # max() would raise ValueError here anyway; say why.
        raise ValueError("no samples found for package %r in %s" % (packageName, saveFileName))

    cpumax = max(list_cpu)
    cpuavg = "%.2f" % (float(sum(list_cpu)) / len(list_cpu))
    vsizemax = max(list_vss)
    # Floor division keeps the averages integral on Python 2 and 3 alike.
    vsizeavg = sum(list_vss) // len(list_vss)
    rssmax = max(list_rss)
    rssavg = sum(list_rss) // len(list_rss)
    print("%s %s %s %s %s %s" % (cpumax, cpuavg, vsizemax, vsizeavg, rssmax, rssavg))

    # NOTE(review): the statement is built by string formatting because
    # MysqlHelper.update() is called with a finished SQL string. appName and
    # deviceModel must therefore come from trusted configuration.
    sqlquery = ("UPDATE %s " +
                "SET cpumax = '%s',cpuavg = '%s'," +
                "vsizemax = '%s',vsizeavg = '%s'," +
                "rssmax = '%s',rssavg = '%s' " +
                "WHERE devicemodel = '%s' ") % (
                    self.appName, cpumax, cpuavg, vsizemax, vsizeavg, rssmax,
                    rssavg, self.deviceModel)
    print(sqlquery)
    cxn = MysqlHelper.connect()
    cur = cxn.cursor()
    MysqlHelper.update(cur, sqlquery)
    MysqlHelper.finish(cxn)
def getCpuAndMemory(self):
    """Aggregate CPU / VSS / RSS samples for one package and store them.

    Lines of ``self.file`` whose 10th whitespace-separated field equals
    ``self.packageName`` are copied to ``self.infofile`` (which is then
    closed). ``self.saveFileName`` is re-read, fields 3, 6 and 7 are parsed
    as integers after dropping their last character, and the max/average
    values are written to the row of table ``self.appName`` whose
    ``devicemodel`` equals ``self.deviceModel``.

    Raises:
        ValueError: if no sample for the package was found.
    """
    packageName = self.packageName
    saveFileName = self.saveFileName
    infofile = self.infofile

    # Keep only the samples that belong to the package under test.
    for line in iter(self.file.readline, ''):
        fields = line.split()
        # Blank or short lines have no 10th field; skip them instead of
        # failing with IndexError.
        if len(fields) > 9 and fields[9] == packageName:
            infofile.write(line)
    infofile.close()

    list_cpu = []
    list_vss = []
    list_rss = []
    # ``with`` closes the file even when a field fails to parse.
    with open(saveFileName) as resultFile:
        for resultLine in resultFile:
            fields = resultLine.split()
            if len(fields) < 7:
                continue
            # The last character of each field is a unit suffix — presumably
            # '%' or 'K'; confirm against the capture format.
            list_cpu.append(int(fields[2][:-1]))
            list_vss.append(int(fields[5][:-1]))
            list_rss.append(int(fields[6][:-1]))

    if not list_cpu:
        # max() would raise ValueError here anyway; say why.
        raise ValueError("no samples found for package %r in %s" % (packageName, saveFileName))

    cpumax = max(list_cpu)
    cpuavg = "%.2f" % (float(sum(list_cpu)) / len(list_cpu))
    vsizemax = max(list_vss)
    # Floor division keeps the averages integral on Python 2 and 3 alike.
    vsizeavg = sum(list_vss) // len(list_vss)
    rssmax = max(list_rss)
    rssavg = sum(list_rss) // len(list_rss)
    print("%s %s %s %s %s %s" % (cpumax, cpuavg, vsizemax, vsizeavg, rssmax, rssavg))

    # NOTE(review): the statement is built by string formatting because
    # MysqlHelper.update() is called with a finished SQL string. appName and
    # deviceModel must therefore come from trusted configuration.
    sqlquery = ("UPDATE %s " +
                "SET cpumax = '%s',cpuavg = '%s'," +
                "vsizemax = '%s',vsizeavg = '%s'," +
                "rssmax = '%s',rssavg = '%s' " +
                "WHERE devicemodel = '%s' ") % (
                    self.appName, cpumax, cpuavg, vsizemax, vsizeavg, rssmax,
                    rssavg, self.deviceModel)
    print(sqlquery)
    cxn = MysqlHelper.connect()
    cur = cxn.cursor()
    MysqlHelper.update(cur, sqlquery)
    MysqlHelper.finish(cxn)
def start(self):
    """Rebuild the ``tsinghua`` table from the course list JSON.

    Every entry of ``content["course"]`` whose ``about`` path does not
    contain "lecture" is fetched and scraped into one row.
    """
    content = json.loads(self.getContent())
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists tsinghua')
    cur.execute('create table if not exists tsinghua(id int(11) primary key auto_increment,title varchar(255),lesson_code varchar(255),start_time varchar(255),current_sem varchar(255),spend_time varchar(255),short_desc text,knowledge_res text,chapter_info text,common_prob text,teacher_info text,url varchar(255))')
    sql = 'insert into tsinghua(title,lesson_code,start_time,current_sem,spend_time,short_desc,knowledge_res,chapter_info,common_prob,teacher_info,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    for course in content["course"]:
        about = course["about"]
        if "lecture" in about:
            continue
        url = "http://tsinghua.xuetangx.com" + about
        page = self.getPage(url)
        value = [self.getTitle(page)]
        value.extend(
            field[0] + ':' + self.tool.replace(field[1])
            for field in self.getInfo1(page)
        )
        info2 = self.getInfo2(page)
        value.extend(
            field[0] + ':' + re.sub(r'[\n\t]+', r'\n', self.tool.replace(field[1]), flags=re.S)
            for field in info2
        )
        # Pad with blanks so the second info group always fills four columns.
        value.extend('' for _ in range(4 - len(info2)))
        teacher = "".join(
            person[0] + '\n' + person[1] + '\n' + self.tool.replace(person[2]) + '\n'
            for person in self.getTeacherInfo(page)
        )
        value.append(teacher)
        value.append(url)
        MysqlHelper.insert_one(cur, sql, value)
    MysqlHelper.finish(conn)
def create_database(self):
    """Create the ``self.test_db`` database if it does not exist yet.

    Connects without selecting a database. Failures are logged, including
    the exception and traceback, and are not re-raised.
    """
    mysql = mysql_helper.MysqlHelper(
        host=self.mysql_host,
        port=self.mysql_port,
        user=self.mysql_user,
        passwd=self.mysql_passwd,
    )
    # A database name is an identifier and cannot be bound as a query
    # parameter, hence the string formatting.
    create_database_sql = "CREATE DATABASE IF NOT EXISTS %s" % self.test_db
    try:
        mysql.execute_withnodb(create_database_sql)
    except Exception:
        # logging.exception records the traceback, so the cause of the
        # failure is no longer lost.
        logging.exception("create the database %s failed~~", self.test_db)
    else:
        logging.info("create the database %s sucess~~", self.test_db)
def loadFoodPages():
    """Crawl the total page count of each second-level category.

    Selects the ``lb_food_class`` rows with level 2 and ``total_page`` 0,
    downloads each category page, extracts N from the "共N页" marker and
    stores it in ``total_page``. Pages without the marker are reported and
    skipped.
    """
    total_page_pattern = re.compile(r'共(\d*)页')
    update_sql = "update lb_food_class set total_page =%s where id = %s"

    sql = "select class_url, id from lb_food_class where level = %s and total_page = %s"
    params = (2, 0)
    print(sql % params)
    rows = MysqlHelper.MysqlHelper().fetchall(sql, params)
    for class_url, class_id in rows:
        request = urllib2.Request(class_url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        # search(), not match(): the marker sits in the middle of the
        # document, and match() only succeeds at position 0, so the old code
        # always got None and raised AttributeError on .group().
        found = total_page_pattern.search(html)
        if found is None:
            print("total page marker not found: %s" % class_url)
        else:
            update_params = (found.group(1), class_id)
            print(update_sql % update_params)
            MysqlHelper.MysqlHelper().cud(update_sql, update_params)
        sleepRandom()
def __init__(self, configFilePath):
    """Create the DB helper from the ``DatabaseConfig`` element of an XML file.

    Host, port, user name and database name are read from
    ``configFilePath``; the password argument is passed as an empty string.
    """
    self.configFilePath = configFilePath
    # Stays None if reading the configuration below raises.
    self.dbHelper = None

    def setting(name):
        return readInXml.getElement(self.configFilePath, 'DatabaseConfig', name)

    self.dbHelper = MysqlHelper.DB(
        setting('host'),
        int(setting('port')),
        setting('userName'),
        "",
        setting('dbName'),
    )
def loadFoodClass2():
    """Crawl the second-level categories.

    For every level-1 row of ``lb_food_class`` the category page is fetched
    and each link under a ``dl`` group is inserted as a level-2 row, tagged
    with the group's ``dt`` text.
    """
    select_sql = "select class_url, id from lb_food_class where level = %s"
    select_params = (1,)
    print(select_sql % select_params)
    rows = MysqlHelper.MysqlHelper().fetchall(select_sql, select_params)

    insert_sql = ("insert into lb_food_class ( name , parent_id , level , class_url , tag)"
                  " values ( %s, %s, %s, %s, %s)")
    for parent_url, parentId in rows:
        groups = getContent(parent_url).xpath('//div[@class="main"]/div/div/dl')
        for group in groups:
            tag = group.xpath("dt/text()")[0]
            for link in group.xpath("dd/a"):
                insert_params = (link.text, parentId, 2, link.attrib.get("href"), tag)
                print(insert_sql % insert_params)
                MysqlHelper.MysqlHelper().cud(insert_sql, insert_params)
def loadFoodMaterialListPage(url, class1_id, class2_id):
    """Scrape one material listing page and insert every entry into ``lb_food_material``."""
    sql = ("insert into lb_food_material ( name , class1_id , class2_id ,description, html_url,"
           " logo_url) values ( %s, %s, %s, %s, %s, %s)")
    for card in getContent(url).xpath('//div[@class="listtyle1"]'):
        html_url = card.xpath('div[@class="img"]/a/@href')[0]
        logo_url = card.xpath("div/a/img/@src")[0]
        title = card.xpath('div[@class="info1"]/h3/a/text()')[0]
        # The last description span wins; blank when there is none.
        spans = card.xpath('div[@class="info1"]/div/span/text()')
        description = spans[-1] if spans else ""
        params = (title, class1_id, class2_id, description, html_url, logo_url)
        MysqlHelper.MysqlHelper().cud(sql, params)
def spiderComment(food_id, content):
    """Insert every comment found in ``content`` into ``lb_food_comment``.

    Returns True when each comment's insert reported exactly one affected
    row.
    """
    # Crawl comment data
    time_pattern = re.compile(r'(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})来自')
    user_id_pattern = re.compile(r'.*id=(\d*)')
    sql = ("insert into lb_food_comment ( food_id , user_name , user_id , user_avatar_url ,"
           " user_url, content, comment_time) values ( %s, %s, %s, %s, %s, %s, %s)")

    comlist = content.xpath("//div[@class='cp_comlist_w']/ul/li")
    inserted = 0
    for node in comlist:
        comment_text = node.xpath(".//p/strong")[0].tail.encode("utf-8").strip()

        comm_time = node.xpath(".//div/span/text()")[0].encode("utf-8")
        stamps = time_pattern.findall(comm_time)
        commentTime = stamps[0] if len(stamps) > 0 else ""
        # strptime raises ValueError when no timestamp was found above.
        comment_time = datetime.datetime.strptime(commentTime, '%Y-%m-%d %H:%M:%S')

        names = node.xpath("a/h5/text()")
        user_name = names[0] if len(names) > 0 else ""
        user_avatar_url = node.xpath("a/img/@src")[0]

        # The user id is taken from the "id=" part of the profile link.
        user_id = 0
        user_url = ""
        hrefs = node.xpath("a/@href")
        if len(hrefs) > 0:
            profile_url = hrefs[0]
            found = user_id_pattern.match(profile_url)
            if found is not None:
                user_id = found.group(1)
                user_url = profile_url

        params = (food_id, user_name, user_id, user_avatar_url, user_url, comment_text, comment_time)
        if MysqlHelper.MysqlHelper(isTest).cud(sql, params) == 1:
            inserted += 1
    return inserted == len(comlist)
def loadFoodClass1():
    """Crawl the first-level categories.

    Reads the navigation block of https://www.meishij.net/jiankang/ and
    inserts each ``dt`` link into ``lb_food_class`` as a level-1 row.
    """
    sql = "insert into lb_food_class ( name , level , class_url ) values ( %s, %s, %s)"
    url = "https://www.meishij.net/jiankang/"
    groups = getContent(url).xpath('//div[@class="nav"]/ul/li[2]/div/div/div/dl')
    for group in groups:
        for link in group.xpath("dt/a"):
            params = (link.text, 1, link.attrib.get("href"))
            print(sql % params)
            MysqlHelper.MysqlHelper().cud(sql, params)
def write2SQL(item):
    """
    Insert the returned data into the database.

    :param item: mapping with ``title``, ``stars`` and ``releasetime``; the
        latter two are stored as the text between their first and second ":"
    :return: None
    """
    dbhelper = MysqlHelper.DbHelper()
    title = item['title']
    actor = item['stars'].split(":")[1]
    release_time = item['releasetime'].split(":")[1]
    sql = "INSERT INTO newdatabase.maoyan(title,actor,time) VALUES(%s,%s,%s)"
    result = dbhelper.execute(sql, (title, actor, release_time))
    # Compared with ``== True`` on purpose: only True/1 count as success.
    print("插入成功" if result == True else "插入失败")
def spiderMaterial(food_id, content):
    """Scrape the ingredient lists of a recipe page into lb_food_material_assoc.

    :param food_id: id of the recipe the ingredients belong to
    :param content: parsed page; only its ``xpath`` method is used
    :return: True when every ingredient ``<li>`` found produced a ``cud``
        result of 1
    """
    def first_or_blank(nodes):
        # First xpath hit, or an empty string when there is none.
        return nodes[0] if len(nodes) > 0 else ""

    insert_sql = ("insert into lb_food_material_assoc ( food_id , tag, name, thumbnail_url, dosage)"
                  " values ( %s, %s, %s, %s, %s )")
    saved = 0
    expected = 0
    for box in content.xpath("//div[@class='materials_box']/div"):
        # Boxes whose class attribute is "goods_ad" are skipped entirely.
        css_classes = box.xpath("./@class")
        if css_classes and css_classes[0] == "goods_ad":
            continue

        tag = box.xpath("h3/a/text()")[0]
        entries = box.xpath("ul/li")
        expected += len(entries)
        for entry in entries:
            thumbnail_url = first_or_blank(entry.xpath("a/img/@src"))

            # Name and dosage each have a second location that is tried when
            # the first one yields nothing.
            name = first_or_blank(entry.xpath("div/h4/a/text()"))
            if name == "":
                name = first_or_blank(entry.xpath("h4/a/text()"))

            dosage = first_or_blank(entry.xpath("div/h4/span/text()"))
            if dosage == "":
                dosage = first_or_blank(entry.xpath("span/text()"))

            row = (food_id, tag, name, thumbnail_url, dosage)
            if MysqlHelper.MysqlHelper(isTest).cud(insert_sql, row) == 1:
                saved += 1
    return expected == saved
def loadFoodDetail():
    """Crawl the detail page of every not-yet-crawled recipe in the current page.

    Selects one page (module-level ``page`` / ``pageSize``) of lb_food rows
    with ``is_spider_detail = 0`` and runs the four scraping stages on each
    row in order: base info, comments, process, materials. The first stage
    that returns a falsy value stops the row and its error text is recorded
    through ``updateSpiderDetail``; when all stages succeed the row is marked
    as crawled. Exceptions raised while handling a row are recorded the same
    way and do not stop the loop. ``sleepRandom`` is called once after each
    row.
    """
    sql = "select id, html_url from lb_food where is_spider_detail = %s limit %s, %s "
    # MySQL's "LIMIT offset, row_count": the second value is the number of
    # rows to fetch, so it must be pageSize rather than pageSize * page.
    params = (0, pageSize * (page - 1), pageSize)
    print(sql % params)
    rows = MysqlHelper.MysqlHelper(isTest).fetchall(sql, params)

    # (stage function, message printed on success, message recorded on failure)
    stages = (
        (spiderBaseInfo, "爬取基本信息成功", "爬取基本信息出错"),
        (spiderComment, "爬取评论数据成功", "爬取评论数据出错"),
        (spiderProcess, "爬取做法成功", "爬取做法出错"),
        (spiderMaterial, "爬取用料成功", "爬取用料出错"),
    )

    for row in rows:
        food_id, html_url = row
        try:
            content = getContent(html_url)
            for stage, ok_msg, error_msg in stages:
                if not stage(food_id, content):
                    print(error_msg)
                    updateSpiderDetail(error_msg, food_id)
                    break
                print(ok_msg)
            else:
                # Every stage succeeded: update the crawl status.
                updateSpiderDetail("爬取成功", food_id, True)
        except Exception as e:
            # ``e.message`` only exists on Python 2 (and may be empty there);
            # fall back to str(e) so the handler itself cannot fail.
            updateSpiderDetail(getattr(e, "message", None) or str(e), food_id)
        sleepRandom()
def truncate_table_sql(model, batch_size):
    """Truncate a fixed list of tables in the test database of *model*.

    Connection settings are read from the ``db`` section of
    ./conf/load_config.conf. The database name is the ``test_db`` option of
    section ``conf_<model>``, formatted with *batch_size*.

    :param model: suffix selecting the ``conf_<model>`` config section
    :param batch_size: value substituted into the ``test_db`` template
    :return: None. The process exits with status 1 when ``test_db`` cannot be
        resolved; any other setup error is printed and the function returns.
    """
    try:
        cf = ConfigParser.ConfigParser()
        cf.read("./conf/load_config.conf")
        mysql_host = cf.get("db", "mysql_host")
        mysql_port = cf.getint("db", "mysql_port")
        mysql_user = cf.get("db", "mysql_user")
        mysql_passwd = cf.get("db", "mysql_passwd")
        conf_name = "conf_%s" % model
        try:
            test_db = cf.get(conf_name, "test_db") % batch_size
        except Exception as exception:
            # Say why the process is about to exit instead of dying silently.
            print("[ERROR]: cannot resolve test_db from section %s: %s"
                  % (conf_name, exception))
            sys.exit(1)
        mysql = mysql_helper.MysqlHelper(host=mysql_host,
                                         port=mysql_port,
                                         user=mysql_user,
                                         passwd=mysql_passwd,
                                         db=test_db)
    except Exception as exception:
        print(exception)
        return

    table_name = [
        "anakin2_yolo_time_satistic_k1200", "anakin2_yolo_time_satistic_p4",
        "anakin_tensorrt_time_satistic_k1200", "anakin_tensorrt_time_satistic_p4",
        "nvidia_list_1sec_k1200", "nvidia_list_1sec_p4",
        "nvidia_list_1sec_version_k1200", "nvidia_list_1sec_version_p4",
        "nvidia_list_1sec_version_tensorRT_k1200", "nvidia_list_1sec_version_tensorRT_p4",
        "top_list_1sec_avg_k1200", "top_list_1sec_avg_p4",
        "top_list_1sec_avg_tensorRT_k1200", "top_list_1sec_avg_tensorRT_p4",
        "top_list_1sec_k1200", "top_list_1sec_p4",
        "log_monitor_k1200", "log_monitor_p4",
    ]
    for item in table_name:
        truncate_sql = "truncate table %s" % item
        print("[INFO]: start truncate the sql")
        try:
            truncate_result = mysql.executes(truncate_sql)
            print("[INFO]: truncate %s success!!!" % item)
            print(truncate_result)
        except Exception as exception:
            # A failing table is reported and the remaining tables are still
            # processed.
            print("[ERROR]: truncate %s error!!!" % item)
            print(exception)
def __init__(self, db_name):
    """Read the MySQL settings from the config file and build the helper.

    :param db_name: database name passed to ``mysql_helper.MysqlHelper``

    Any error while reading ../../conf/load_config.conf or creating the
    helper is printed and swallowed; ``self.mysql`` then stays None.
    """
    self.config = {}
    # Defined up front so the attribute exists even when setup fails below.
    self.mysql = None
    try:
        cf = ConfigParser.ConfigParser()
        cf.read("../../conf/load_config.conf")
        self.mysql_host = cf.get("db", "mysql_host")
        self.mysql_port = cf.getint("db", "mysql_port")
        self.mysql_user = cf.get("db", "mysql_user")
        self.mysql_passwd = cf.get("db", "mysql_passwd")
        self.test_db = db_name
        self.mysql = mysql_helper.MysqlHelper(host=self.mysql_host,
                                              port=self.mysql_port,
                                              user=self.mysql_user,
                                              passwd=self.mysql_passwd,
                                              db=self.test_db)
    except Exception as exception:
        # Call form of print: valid on both Python 2 and Python 3.
        print(exception)
def getNewsDetail(self, news_url):
    """Fetch one news page, extract its fields and store them in ``disease``.

    Args:
        news_url: link of the individual news article

    Returns:
        dict with the keys url, title, date, time, datetime and article when
        the page has a title, a date and a time element; an empty dict
        otherwise (nothing is stored in that case).
    """
    result = {}
    res = requests.get(news_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    titles = soup.select('.article-title h2')
    dates = soup.select('.date')
    times = soup.select('.time')
    # Check that each selection is non-empty before indexing it, so a page
    # without a .date or .time element is skipped instead of raising
    # IndexError.
    if not (titles and dates and times):
        return result
    if not (len(dates[0]) > 0 and len(times[0]) > 0):
        return result

    result['url'] = news_url
    result["title"] = titles[0].text
    # NOTE(review): lstrip() removes any leading run of these characters, not
    # the literal prefix; left unchanged to keep the stored values the same.
    result["date"] = dates[0].text.lstrip('发布时间:')
    result["time"] = times[0].text

    # Short dates get a hard-coded prefix: 5 characters (presumably MM-DD)
    # get the year 2020, 8 characters (presumably YY-MM-DD) get the century.
    if len(result["date"]) == 5:
        result["datetime"] = "2020-" + result["date"] + " " + result["time"]
    elif len(result["date"]) == 8:
        result["datetime"] = "20" + result["date"] + " " + result["time"]
    else:
        result["datetime"] = result["date"] + " " + result["time"]

    result["article"] = self.getArticle(news_url)

    msh = MysqlHelper.MysqlHelper(host="localhost",
                                  username="******",
                                  password="******",
                                  db="baiduSearchNews",
                                  charset="utf8",
                                  port=3306)
    msh.connect()
    # NOTE(review): scraped text is interpolated straight into the statement,
    # so a quote in the article or title breaks the insert and allows SQL
    # injection. Switch to a parameterized call once the helper's insert()
    # signature is confirmed to accept parameters.
    sql = "insert into disease values('%s','%s','%s','%s')" % (
        result["article"], result["datetime"], result["title"], result['url'])
    msh.insert(sql)
    return result
def start(self):
    """Rebuild the xjtu (and, if present, xjtu_lecture) tables from the site.

    The course list comes from ``self.getContent()`` parsed as JSON. Each
    course page is fetched and parsed into a ``Course`` row. When the JSON has
    a non-empty "lecture" list, the lecture table is rebuilt the same way with
    ``Lecture`` rows. Both tables are dropped and recreated before inserting.
    """
    # (label shown on the course page, Course attribute that receives it)
    basic_fields = (
        ("课程代码", "lesson_code"),
        ("开课时间", "start_time"),
        ("当前学期", "current_sem"),
        ("投入时间", "spend_time"),
    )
    text_fields = (
        ("课程简介", "short_desc"),
        ("知识储备", "knowledge_res"),
        ("章节信息", "chapter_info"),
        ("常见问题", "common_prob"),
    )

    content = json.loads(self.getContent())
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    cur.execute('drop table if exists xjtu')
    cur.execute('create table if not exists xjtu(id int(11) primary key auto_increment,title varchar(255),lesson_code varchar(255),start_time varchar(255),current_sem varchar(255),spend_time varchar(255),short_desc text,knowledge_res text,chapter_info text,common_prob text,teacher_info text,url varchar(255))')
    course_sql = 'insert into xjtu(title,lesson_code,start_time,current_sem,spend_time,short_desc,knowledge_res,chapter_info,common_prob,teacher_info,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

    for entry in content["course"]:
        course = Course()
        url = "http://xjtu.xuetangx.com" + entry["about"]
        page = self.getPage(url)
        course.title = self.getTitle(page)

        for pair in self.getInfo1(page):
            for label, attr in basic_fields:
                if pair[0] == label:
                    setattr(course, attr, self.tool.replace(pair[1]))

        for pair in self.getInfo2(page):
            for label, attr in text_fields:
                if pair[0] == label:
                    # Runs of newlines/tabs are collapsed to a single newline.
                    cleaned = re.sub(r'[\n\t]+', r'\n', self.tool.replace(pair[1]), flags=re.S)
                    setattr(course, attr, cleaned)

        # One "name\ntitle\nbio\n" chunk per teacher, concatenated.
        teachers = self.getTeacherInfo(page)
        if teachers:
            course.teacher_info = "".join(
                t[0] + '\n' + t[1] + '\n' + self.tool.replace(t[2]) + '\n'
                for t in teachers
            )
        else:
            course.teacher_info = ""
        course.url = url

        MysqlHelper.insert_one(cur, course_sql, [
            course.title,
            course.lesson_code,
            course.start_time,
            course.current_sem,
            course.spend_time,
            course.short_desc,
            course.knowledge_res,
            course.chapter_info,
            course.common_prob,
            course.teacher_info,
            course.url,
        ])

    if content["lecture"]:
        cur.execute('drop table if exists xjtu_lecture')
        cur.execute('create table if not exists xjtu_lecture(id int(11) primary key auto_increment,title varchar(255),intro text,guest text,video_info text,addr text,url varchar(255))')
        lecture_sql = 'insert into xjtu_lecture(title,intro,guest,video_info,addr,url) values(%s,%s,%s,%s,%s,%s)'
        for entry in content["lecture"]:
            lecture = Lecture()
            url = "http://xjtu.xuetangx.com" + entry["about"]
            page = self.getPage(url)
            lecture.title = self.getLectureTitle(page)
            lecture.intro = self.tool.replace(self.getLectureIntro(page))
            lecture.guest = self.tool.replace(self.getLectureGuest(page))
            lecture.video_info = self.tool.replace(self.getVideoInfo(page))
            lecture.addr = self.tool.replace(self.getLectureInfo(page))
            lecture.url = url
            MysqlHelper.insert_one(cur, lecture_sql, [
                lecture.title,
                lecture.intro,
                lecture.guest,
                lecture.video_info,
                lecture.addr,
                lecture.url,
            ])

    MysqlHelper.finish(conn)
def start(self):
    """Crawl the course listing pages and insert one mooc row per course.

    Listing pages 78 up to the last page (the second-to-last entry returned
    by ``getPageNum`` for page 1) are walked; every course URL on them is
    fetched and parsed into an ``Item``. The ``mooc`` table is not created
    here; only inserts are issued.
    """
    # (Item attribute, headings that may introduce that section on the page)
    sections = (
        ("brief", ["课程概述", "课程概况", "课程简介"]),
        ("teacher", ["授课教师", "主讲教师"]),
        ("chapter", ["授课大纲", "课程大纲"]),
        ("require", ["先修要求", "先修知识", "背景知识"]),
        ("form", ["授课形式"]),
        ("question", ["常见问题解答", "常见问题"]),
        ("resource", ["参考资料"]),
    )

    first_index = self.getContent(1)
    page_numbers = self.getPageNum(first_index)
    conn = MysqlHelper.connect()
    cur = conn.cursor()
    sql = 'insert into mooc(title_chinese,title_english,brief,teacher,chapter,requires,form,question,resource,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

    last_page = int(page_numbers[-2])
    # NOTE(review): the start page 78 is hard-coded, presumably to resume an
    # earlier crawl; confirm before reuse.
    for page_no in range(78, last_page + 1):
        for course_url in self.getURL(self.getContent(page_no)):
            course = Item()
            page = self.getPage(course_url)
            course.title_chinese = self.getTitle_chinese(page)
            course.title_english = self.getTitle_english(page)

            # The appended </div> is used to detect the end of the text.
            block = self.getBlock(page) + '</div>'
            for attr, headings in sections:
                if self.wordInText(block, headings):
                    setattr(course, attr,
                            self.tool.replace(self.getText(block, headings)))

            course.url = course_url
            MysqlHelper.insert_one(cur, sql, [
                course.title_chinese,
                course.title_english,
                course.brief,
                course.teacher,
                course.chapter,
                course.require,
                course.form,
                course.question,
                course.resource,
                course.url,
            ])

    MysqlHelper.finish(conn)