def need_instant_update():
    cursor = mydb.cursor()
    sql = 'select * from user_click where times>10'
    cursor.execute(sql)
    results = cursor.fetchall()
    return len(results) > 0
def crawl(url, webid):
    """Crawl an article list page, store new articles, then extract keywords from recent titles."""
    driver = get_driver()
    try:
        cursor = mydb.cursor()
        driver.get(url)
        time.sleep(6)
        driver.implicitly_wait(30)
        # driver.set_page_load_timeout(30)
        # driver.set_script_timeout(30)

        # Scroll the page down and back up a few times so lazily loaded content is rendered.
        js_down = "var q=document.documentElement.scrollTop=100000"
        js_up = "var q=document.documentElement.scrollTop=0"
        for i in range(6):
            driver.execute_script(js_down)
            time.sleep(2)
            driver.execute_script(js_up)
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        # cursor.execute('select articleid from article ORDER BY articleid DESC LIMIT 1')
        # result = cursor.fetchall()[0][0]
        error = 0
        results = getdata(soup, webid)
        for i in results:
            try:
                cursor.execute(
                    "insert into article(title,url,webid,source,imgurl,date) values (%s,%s,%s,%s,%s,%s)",
                    i)
                mydb.commit()
            except Exception:
                # Articles that already exist fail the insert; count them instead.
                error = error + 1
        print(datetime.datetime.now(),
              ' Articles updated: %d new, %d already existed.' % (len(results) - error, error))

        # Cut the 100 most recent titles into keywords and bump each keyword's fever counter.
        error = 0
        cursor.execute('select articleid,title from article ORDER BY articleid DESC LIMIT 100')
        results = cursor.fetchall()
        for i in results:
            # if i[0] == result:
            #     break
            seg_list = list(jieba.cut(i[1], cut_all=True))
            for item in seg_list:
                try:
                    cursor.execute(
                        'insert into article_keyword(articleid, keyword) VALUES (%s,%s)',
                        (i[0], item,))
                    mydb.commit()
                    cursor.execute('update keyword set fever=fever+1 where name=%s', (item,))
                    mydb.commit()
                except Exception:
                    error = error + 1
        print(datetime.datetime.now(), ' Keyword update finished')
    except Exception as e:
        print(str(e))
    driver.quit()
def user_keyword_update(action):
    """Rebuild the per-user keyword model from click data.

    action == 1 rebuilds from the daily view and clears all clicks;
    any other value rebuilds only for users with more than 10 clicks.
    """
    cursor = mydb.cursor()
    if action == 1:
        sql = 'select * from user_keyword_daily_view'
        sql2 = ('delete from user_keyword where exists'
                '(select * from user_click where user_click.userid=user_keyword.userid)')
    else:
        sql = 'select * from user_keyword_update_instant_view'
        sql2 = ('delete from user_keyword where exists'
                '(select * from user_click where user_click.userid=user_keyword.userid and times>10)')
    cursor.execute(sql)
    results = cursor.fetchall()
    cursor.execute(sql2)
    mydb.commit()
    lists = []
    if len(results) == 0:
        print(datetime.datetime.now(), 'No users to update')
        return
    for item in results:
        userid = item[0]
        keyword = item[1]
        date = item[2]
        times = item[3]
        # Time-decayed interest score: fresher clicks contribute more, weighted by click count.
        fever = math.log((1 + date) / MAX_DATA) / math.log(1 / 60) * times
        lists.append({'userid': userid, 'keyword': keyword, 'fever': fever})
    df = pandas.DataFrame(lists)
    data = df.groupby(['userid', 'keyword']).sum()
    data2 = data.groupby('userid')
    user = 0
    for name, item in data2:
        user = user + 1
        dic = item.to_dict()['fever']
        pairs = []
        for i in dic:
            pairs.append((name, i[1], dic[i]))
        # Keep only the 20 highest-scoring keywords per user.
        pairs.sort(key=lambda a: a[2], reverse=True)
        for i in pairs[0:20]:
            sql = 'insert into user_keyword(userid, keyword, fever) VALUES (%s,%s,%s)'
            cursor.execute(sql, i)
            mydb.commit()
    if action == 1:
        sql = 'delete from user_click'
    else:
        sql = 'delete from user_click where times>10'
    cursor.execute(sql)
    mydb.commit()
    print(datetime.datetime.now(), ' User model updated for %d users' % user)
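# A quick sanity check of the decay weight used in user_keyword_update above.
# Assuming MAX_DATA is an upper bound on `date` (its exact meaning comes from the
# SQL views, which are not shown here), the weight equals `times` when
# (1 + date) == MAX_DATA / 60 and falls to 0 as (1 + date) approaches MAX_DATA.
# _fever_example is a hypothetical helper for illustration only; it reuses the
# `math` import already required by user_keyword_update.
def _fever_example(date, times, max_data=60):
    return math.log((1 + date) / max_data) / math.log(1 / 60) * times

# _fever_example(0, 3)  -> 3.0  (freshest click keeps its full weight)
# _fever_example(59, 3) -> 0.0  (weight decays to zero at the age bound)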
def video_crawler(url, webid):
    """Crawl a video list page, insert new videos, and refresh fever for videos already stored."""
    driver = get_driver()
    try:
        cursor = mydb.cursor()
        driver.get(url)
        time.sleep(6)
        driver.implicitly_wait(30)
        # driver.set_page_load_timeout(30)
        # driver.set_script_timeout(30)

        # Scroll down and back up once so lazily loaded content is rendered.
        js_down = "var q=document.documentElement.scrollTop=100000"
        js_up = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js_down)
        time.sleep(2)
        driver.execute_script(js_up)
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        # cursor.execute('select articleid from article ORDER BY articleid DESC LIMIT 1')
        # result = cursor.fetchall()[0][0]
        error = 0
        resultss = get_data(soup, webid)
        for i in resultss:
            print(i)
        for i in resultss:
            try:
                sql = 'insert into video(url, webid, source, date, fever, authorimgurl, title) ' \
                      'VALUES (%s,%s,%s,%s,%s,%s,%s)'
                cursor.execute(sql, i)
                mydb.commit()
            except Exception as e:
                print(e)
                try:
                    # The video already exists: refresh its fever instead of inserting.
                    sql = 'update video set fever=%s where url=%s'
                    cursor.execute(sql, (i[4], i[0],))
                    mydb.commit()
                except Exception as e:
                    print(e)
                error = error + 1
        print(datetime.datetime.now(),
              ' Videos updated: %d new, %d already existed.' % (len(resultss) - error, error))
    except Exception as e:
        print(str(e))
    driver.quit()
import struct
import os

from settings import mydb
import mysql.connector

cursor = mydb.cursor()

# The original code did not work under Python 3 and contained many bugs; some of its
# functions were unnecessary and parts of it were poorly written or redundant.
# I therefore made many detail-level changes while keeping the overall structure intact,
# so that no garbled text appears, importing a whole folder is easier, and so on.
# Author: Ling Yue, Taiyuan U of Tech
# Blog: http://blog.yueling.me
# Original author:

# A Sogou .scel lexicon stores its text as Unicode, two bytes per character
# (a Chinese character or a Latin letter); parsing it is a matter of finding
# the offset of each section. There are two main parts:
#
# 1. Global pinyin table, apparently every pinyin combination, in dictionary order.
#    A list of (index, len, pinyin) records:
#    index:  two-byte integer, the index of this pinyin
#    len:    two-byte integer, the byte length of the pinyin
#    pinyin: the pinyin itself, two bytes per character, len bytes in total
#
# 2. Chinese word table
#    A list of (same, py_table_len, py_table, {word_len, word, ext_len, ext}) records:
#    same:         two-byte integer, number of words sharing this pronunciation
#    py_table_len: two-byte integer
#    py_table:     list of two-byte integers, each one the index of a pinyin
#
#    word_len: two-byte integer, byte length of the Chinese word
#    word:     the Chinese word, two bytes per character, word_len bytes in total
#    ext_len:  two-byte integer, length of the extension info, seemingly always 10
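# The format notes above map directly onto struct.unpack calls. The sketch below
# reads the global pinyin table into a dict; the 0x1540/0x2628 offsets and the
# little-endian UTF-16 decoding are assumptions borrowed from typical scel readers,
# not facts stated in the notes, so adjust them if your files differ.
def read_pinyin_table(data, start=0x1540, end=0x2628):
    table = {}
    pos = start
    while pos < end:
        # Each record is (index, len) as two little-endian uint16s, followed by
        # `len` bytes of UTF-16LE pinyin text.
        index, length = struct.unpack('<HH', data[pos:pos + 4])
        pos += 4
        table[index] = data[pos:pos + length].decode('utf-16-le')
        pos += length
    return table

# A word record's py_table can then be turned back into readable pinyin by looking
# up each two-byte index in the dict returned here.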