def getAllCourse():
    """Fetch every course-list page from Chinese MOOC and store it in MySQL.

    Downloads the first page of the course search API, reads the total page
    count from it, drops and recreates the ``chinesemooc`` table, and then
    hands each page to ``getAllCourseInfo`` for parsing and insertion.

    Returns:
        None. If the first page cannot be fetched, an error is printed and
        the database is left untouched.
    """
    # Get the first course-list page and the total page count.
    url = "http://www.chinesemooc.org/api/search_by_classid.php?classid=all"
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
    header = {'User-Agent': user_agent}

    html_page = getHtml(url, header)
    if html_page is None:
        print("ERROR:Get data from Chinese Mooc failed...")
        return

    # SECURITY NOTE(review): eval() executes whatever the remote server sends.
    # It is kept because swapping the parser would change the string types
    # that the downstream helpers receive, but it should be replaced by a
    # real parser once those helpers are adapted.
    course_list = eval(html_page)

    # Fall back to a single page when the response carries no page count,
    # instead of failing with KeyError before the check is even reached.
    total_page_num = 1
    if 'msg' in course_list and 'page_total' in course_list['msg']:
        # int() also accepts a page count delivered as a numeric string.
        total_page_num = int(course_list['msg']['page_total'])

    conn = MySqlHelper.connect()
    cur = conn.cursor()
    # The table is rebuilt from scratch on every run.
    cur.execute('drop table if exists chinesemooc')
    cur.execute(
        'create table chinesemooc(course_id int(11) primary key,course_title varchar(255),'
        'course_term varchar(255),course_outline text,course_view_num int(11),course_comment_num int(11),'
        'course_price int(11),signup int(11),course_des text,teacher_info text,assistant text,school varchar(255))'
    )
    # %-style template; getAllCourseInfo fills in the twelve placeholders.
    sql = (
        'insert into chinesemooc(course_id,course_title,course_term,course_outline,course_view_num,course_comment_num,'
        'course_price,signup,course_des,teacher_info,assistant,school) '
        'values(%d,"%s","%s","%s",%d,%d,%d,%d,"%s","%s","%s","%s")'
    )

    getAllCourseInfo(course_list, header, cur, sql)

    # Page 1 is already handled above; walk the remaining pages.
    for page_index in range(2, total_page_num + 1):
        page_url = url + "&page=" + str(page_index)
        tmp_page = getHtml(page_url, header)
        if tmp_page is None:
            # Skip pages that could not be downloaded.
            continue
        tmp_course_list = eval(tmp_page)  # see SECURITY NOTE above
        getAllCourseInfo(tmp_course_list, header, cur, sql)
        # Pause between pages to avoid hammering the remote server.
        time.sleep(10)

    MySqlHelper.finish(conn)
def getAllCourseInfo(course_list_dic, header, cur, sql):
    """Parse one page of the course-list response and insert its courses.

    Args:
        course_list_dic: Decoded API response. The courses are read from
            ``course_list_dic['msg']['list']``; ``None`` or a response
            without those keys is silently ignored.
        header: HTTP headers forwarded to ``parseCoursePage``.
        cur: Database cursor handed to ``MySqlHelper.insert_one``.
        sql: ``%``-style INSERT template with twelve placeholders, filled in
            the order course_id, course_title, course_term, course_outline,
            course_view_num, course_comment_num, course_price, signup,
            course_des, teacher_info, assistant, school.

    Returns:
        None.
    """
    if course_list_dic is None:
        return
    if 'msg' not in course_list_dic:
        return
    if 'list' not in course_list_dic['msg']:
        return

    # The original opened course.txt in "w+" mode and never used or closed
    # it. Keep the create/truncate side effect but release the handle
    # immediately instead of leaking it for the whole scrape.
    with open("course.txt", "w+"):
        pass

    item_list = []
    for course in course_list_dic['msg']['list']:
        item = Item()
        item.course_id = int(course['kvideoid'])
        print(item.course_id)
        item.course_view_num = int(course['viewnum'])
        print(item.course_view_num)
        item.course_title = uni2utf(course['subject'])
        print(item.course_title)
        item.course_price = int(course['price'])
        print(item.course_price)
        item.course_signup = int(course['signup'])
        # The insert below reads ``signup``; mirror the scraped value there
        # so it is not lost. Done before parseCoursePage so anything that
        # function sets on the item still takes precedence.
        item.signup = item.course_signup
        print(item.course_signup)
        item.comment_num = int(course['comment_num'])
        # Same mismatch: the insert below reads ``course_comment_num``.
        item.course_comment_num = item.comment_num
        print(item.comment_num)
        item.course_des = uni2utf(course['kvideo_desc'])
        print(item.course_des)
        item.school = uni2utf(course['teacher_info']['school_name'])
        print(item.school)

        # Fill in the remaining fields from the course's own page.
        tmp_url = "http://www.chinesemooc.org/mooc/" + str(item.course_id)
        parseCoursePage(tmp_url, header, item)
        item_list.append(item)

    for record_item in item_list:
        values = (
            record_item.course_id,
            record_item.course_title,
            record_item.course_term,
            record_item.course_outline,
            record_item.course_view_num,
            record_item.course_comment_num,
            record_item.course_price,
            record_item.signup,
            record_item.course_des,
            record_item.teacher_info,
            record_item.assistant,
            record_item.school,
        )
        # SECURITY NOTE(review): the statement is built by string
        # interpolation from scraped text, so a double quote or backslash in
        # any field breaks or alters the SQL. Switching to driver-side
        # parameters requires changing the template supplied by the caller.
        statement = sql % values
        MySqlHelper.insert_one(cur, statement, None)
def __init__(self, url):
    """Initialise the instance with a URL, a DB helper and request headers.

    Args:
        url: Stored unchanged as ``self.url``.
    """
    self.url = url
    # Database access goes through the project's MySQL utility object.
    self.sql = MySqlHelper.MySQL_Utils()
    # User-Agent string imitating desktop Chrome 62 on Windows 10.
    chrome_user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    )
    self.header = {
        'User-Agent': chrome_user_agent,
        'Connection': 'keep-alive',
    }
def get_now_time(self):
    """Write the current date-time into ``machines.starttime`` and print it.

    Creates a ``MySqlHelper.MySqlHelper`` on ``self.sqlhelp``, selects the
    row of the machine with sn ``30038935``, then updates that row's
    ``starttime`` with the current time formatted as
    ``yyyy-MM-dd hh:mm:ss`` and prints the same string.

    Returns:
        None.
    """
    # Connect to the Alibaba Cloud MySQL database: public IP 47.96.104.151,
    # port 3306, database noodle, table user.
    # Table user -->> id, username, password, loigntime, idDelete
    # NOTE(review): the statements below use table `machines`, not `user`,
    # so the comment above looks stale. The host and credentials are also
    # hard-coded here - consider moving them to configuration.
    self.sqlhelp = MySqlHelper.MySqlHelper(
        '47.96.104.151', 3306, 'root', 'mysql', 'noodle'
    )

    # The result of this lookup is not used; the query is still issued to
    # keep the database interaction unchanged.
    select_machine = "select * from machines where sn='30038935'"
    self.sqlhelp.get(select_machine, [])

    timestamp = QDateTime.currentDateTime().toString("yyyy-MM-dd hh:mm:ss")
    update_starttime = "update machines set starttime=%s where sn='30038935'"
    self.sqlhelp.cud(update_starttime, [timestamp])
    print(timestamp)
def main():
    """Read a page count from stdin, collect IP info and store it.

    Prompts for the number of pages, passes it through ``__GetHtml__``,
    ``__GetIPList__`` and ``__GetIPInfo__``, executes the
    ``free_agent_IP_ins.sql`` statement with the result, and prints the
    count returned by the database helper.

    Raises:
        ValueError: If the entered text is not an integer.
    """
    print('请输入想要爬取页数:')
    # NOTE(review): the original carried a note here about accepting a BV
    # number, video link or video name as input; it appears unrelated to
    # this page-count prompt - confirm whether it can be dropped.
    page_count = int(input())

    # Pipeline: page HTML -> IP list -> tuple of IP info.
    ip_info = __GetIPInfo__(__GetIPList__(__GetHtml__(page_count)))

    db_helper = MySqlHelper.DBHelper(flag=1)
    stored = db_helper.ExecuteNonQryText("free_agent_IP_ins.sql", ip_info)

    print('爬取结果:')
    print("共{0}条".format(stored))
def write2SQL(item):
    """Insert one record into the ``testdb.maoyan`` table.

    Args:
        item: Mapping that provides the ``'title'``, ``'stars'`` and
            ``'releasetime'`` keys; they are written to the ``title``,
            ``actor`` and ``time`` columns respectively.

    Prints a success message when the helper's ``execute`` result compares
    equal to ``True`` and a failure message otherwise. Returns None.
    """
    helper = MySqlHelper.DBHelper()
    row = (item['title'], item['stars'], item['releasetime'])
    statement = "INSERT INTO testdb.maoyan(title,actor,time) VALUES(%s,%s,%s)"
    outcome = helper.execute(statement, row)
    # The equality test (not plain truthiness) is kept on purpose so the
    # reported outcome is unchanged for any non-boolean return value.
    print("插入成功" if outcome == True else "插入失败")
def __init__(self):
    """Register PyMySQL under the MySQLdb name and create the DB helper."""
    # Imported at call time, as in the original, so the module is only
    # needed once an instance is actually constructed.
    from pymysql import install_as_MySQLdb

    install_as_MySQLdb()
    self.sqlHelper = MySqlHelper.MySQL_Utils()