Esempio n. 1
0
def need_instant_update():
    """Report whether any ``user_click`` row has ``times`` greater than 10.

    Returns:
        bool: ``True`` if at least one such row exists, ``False`` otherwise.
    """
    cursor = mydb.cursor()
    try:
        # Only existence matters, so ask the database for at most one row
        # instead of pulling every matching row into memory.
        cursor.execute('select 1 from user_click where times>10 limit 1')
        return bool(cursor.fetchall())
    finally:
        # Release the cursor even when the query raises.
        cursor.close()
Esempio n. 2
0
def crawl(url, webid):
    """Scrape articles from ``url``, store them, then index title keywords.

    The page is loaded in the browser returned by ``get_driver()``, scrolled
    to the bottom and back six times, and its HTML is handed to
    ``getdata(soup, webid)``. Each returned row is inserted into ``article``.
    Afterwards the titles of the 100 highest ``articleid`` rows are segmented
    with ``jieba`` and each segment is written to ``article_keyword`` and
    counted in ``keyword.fever``.

    Args:
        url: Address passed to ``driver.get``.
        webid: Forwarded unchanged to ``getdata``.

    Returns:
        None. Exceptions derived from ``Exception`` are printed, not raised.
    """
    driver = get_driver()
    cursor = None
    try:
        cursor = mydb.cursor()
        driver.get(url)
        time.sleep(6)
        driver.implicitly_wait(30)
        js_down = "var q=document.documentElement.scrollTop=100000"
        js_up = "var q=document.documentElement.scrollTop=0"
        # Scroll down and back up repeatedly before reading the page source
        # (presumably to trigger lazily loaded content -- confirm).
        for _ in range(6):
            driver.execute_script(js_down)
            time.sleep(2)
            driver.execute_script(js_up)
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'lxml')

        error = 0
        articles = getdata(soup, webid)
        for row in articles:
            try:
                cursor.execute("insert into article(title,url,webid,source,imgurl,date) values (%s,%s,%s,%s,%s,%s)", row)
                mydb.commit()
            except Exception:
                # Any failed insert is counted and reported below as an
                # already-existing article.
                error += 1
        print(datetime.datetime.now(),' 更新文章,成功:%d 篇, 已存在:%d 篇。' % (len(articles)-error,error) )

        cursor.execute('select articleid,title from article ORDER BY articleid DESC LIMIT 100')
        for articleid, title in cursor.fetchall():
            for word in jieba.cut(title, cut_all=True):
                try:
                    cursor.execute('insert into article_keyword(articleid, keyword) VALUES (%s,%s)', (articleid, word))
                    mydb.commit()
                    cursor.execute('update keyword set fever=fever+1 where name=%s', (word,))
                    mydb.commit()
                except Exception:
                    # Best effort: a failing pair is skipped, and the fever
                    # update is skipped with it.
                    continue
        print(datetime.datetime.now(),' 更新关键词完成')
    except Exception as e:
        print(str(e))
    finally:
        # Cleanup runs on every exit path, including KeyboardInterrupt.
        if cursor is not None:
            try:
                cursor.close()
            except Exception as e:
                print(str(e))
        driver.quit()
Esempio n. 3
0
def user_keyword_update(action):
    """Recompute each user's top keywords from the click views.

    Steps, in order:
      1. Read all rows from the view selected by ``action``.
      2. Delete the ``user_keyword`` rows of users that have matching
         ``user_click`` rows (this happens even when step 1 returned nothing).
      3. Score every row as
         ``log((1 + date) / MAX_DATA) / log(1 / 60) * times``, sum the scores
         per (userid, keyword) and insert each user's 20 highest-scoring
         keywords into ``user_keyword``.
      4. Delete the processed ``user_click`` rows.

    Args:
        action: ``1`` uses ``user_keyword_daily_view`` and clears all of
            ``user_click``; any other value uses
            ``user_keyword_update_instant_view`` and only touches clicks
            with ``times > 10``.

    Returns:
        None. Progress is reported with ``print``.
    """
    if action == 1:
        select_sql = 'select * from user_keyword_daily_view'
        purge_keywords_sql = 'delete from user_keyword where exists(select * from user_click where user_click.userid=user_keyword.userid)'
        purge_clicks_sql = 'delete from user_click'
    else:
        select_sql = 'select * from user_keyword_update_instant_view'
        purge_keywords_sql = 'delete from user_keyword where exists(select * from user_click where user_click.userid=user_keyword.userid and times>10)'
        purge_clicks_sql = 'delete from user_click where times>10'
    insert_sql = 'insert into user_keyword(userid, keyword, fever) VALUES (%s,%s,%s)'

    cursor = mydb.cursor()
    try:
        cursor.execute(select_sql)
        rows = cursor.fetchall()
        # NOTE(review): the stale keywords are removed before the empty check,
        # exactly as before -- confirm this ordering is intended.
        cursor.execute(purge_keywords_sql)
        mydb.commit()
        if not rows:
            print(datetime.datetime.now(), '没有可更新的用户')
            return

        records = []
        for row in rows:
            # Only the first four columns of the view are used.
            userid, keyword, date, times = row[:4]
            fever = math.log((1 + date) / MAX_DATA) / math.log(1 / 60) * times
            records.append({'userid': userid, 'keyword': keyword, 'fever': fever})

        # Sum the scores of duplicate (userid, keyword) pairs, then walk the
        # result one user at a time.
        totals = pandas.DataFrame(records).groupby(['userid', 'keyword']).sum()
        user = 0
        for name, group in totals.groupby('userid'):
            user += 1
            fever_by_key = group.to_dict()['fever']
            # Keys are (userid, keyword) tuples; key[1] is the keyword.
            ranked = [(name, key[1], score) for key, score in fever_by_key.items()]
            ranked.sort(key=lambda entry: entry[2], reverse=True)
            for entry in ranked[:20]:
                cursor.execute(insert_sql, entry)
            # One commit per user instead of one per inserted row.
            mydb.commit()

        cursor.execute(purge_clicks_sql)
        mydb.commit()
        print(datetime.datetime.now(), ' 更新用户模型成功,共更新%d个用户' % (user))
    finally:
        # Release the cursor on every exit path, including the early return.
        cursor.close()
Esempio n. 4
0
def video_crawler(url, webid):
    """Scrape videos from ``url`` and insert or refresh them in ``video``.

    The page is loaded in the browser returned by ``get_driver()``, scrolled
    to the bottom and back once, and its HTML is handed to
    ``get_data(soup, webid)``. Each returned row is printed and then inserted
    into ``video``. When the insert fails, the row's ``fever`` is written to
    the existing record with the same ``url`` instead.

    Args:
        url: Address passed to ``driver.get``.
        webid: Forwarded unchanged to ``get_data``.

    Returns:
        None. Exceptions derived from ``Exception`` are printed, not raised.
    """
    driver = get_driver()
    cursor = None
    try:
        cursor = mydb.cursor()
        driver.get(url)
        time.sleep(6)
        driver.implicitly_wait(30)
        js_down = "var q=document.documentElement.scrollTop=100000"
        js_up = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js_down)
        time.sleep(2)
        driver.execute_script(js_up)
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'lxml')

        error = 0
        videos = get_data(soup, webid)
        for row in videos:
            print(row)
        insert_sql = 'insert into video(url, webid, source, date, fever, authorimgurl, title) ' \
                     'VALUES (%s,%s,%s,%s,%s,%s,%s)'
        # Same placeholder style as the insert; the previous ``?`` markers did
        # not match the ``%s`` style this cursor is used with.
        update_sql = 'update video set fever=%s where url=%s'
        for row in videos:
            try:
                cursor.execute(insert_sql, row)
                mydb.commit()
            except Exception as insert_error:
                print(insert_error)
                try:
                    # row[4] is fever and row[0] is url, per the insert's
                    # column order.
                    cursor.execute(update_sql, (row[4], row[0]))
                    # Commit so the refreshed fever is actually persisted.
                    mydb.commit()
                except Exception as update_error:
                    print(update_error)
                error += 1
        print(datetime.datetime.now(),' 更新视频,成功:%d 个, 已存在:%d 个。' % (len(videos)-error,error) )
    except Exception as e:
        print(str(e))
    finally:
        # Cleanup runs on every exit path, including KeyboardInterrupt.
        if cursor is not None:
            try:
                cursor.close()
            except Exception as e:
                print(str(e))
        driver.quit()
Esempio n. 5
0
import struct
import os
from settings import mydb
import mysql.connector
# Module-level cursor, created at import time from the `mydb` connection
# object imported from `settings`.
cursor = mydb.cursor()
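# The original code did not run on Python 3 and contained many bugs;
# some functions were unnecessary, and parts of the code were non-idiomatic or redundant.
# I therefore made many detail-level changes while leaving the overall structure essentially intact,
# so that no garbled characters appear, importing from a folder is more convenient, and so on.
# Author:Ling Yue, Taiyuan U of Tech
# Blog: http://blog.yueling.me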

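# Original author's notes:
# A Sogou .scel dictionary stores its text as Unicode, two bytes per character (Chinese character or English letter).
# Locating the offset of each section is all that is needed.
# There are two main parts:
# 1. Global pinyin table: apparently every pinyin combination, in dictionary order
#       Format: a list of (index, len, pinyin) entries
#       index: two-byte integer, the index of this pinyin
#       len: two-byte integer, the byte length of the pinyin
#       pinyin: the pinyin itself, two bytes per character, total length len
#
# 2. Chinese phrase table
#       Format: a list of (same, py_table_len, py_table, {word_len, word, ext_len, ext}) entries
#       same: two-byte integer, the number of homophones
#       py_table_len: two-byte integer
#       py_table: list of integers, two bytes each; each integer is the index of a pinyin
#
#       word_len: two-byte integer, the byte length of the Chinese phrase
#       word: the Chinese phrase, two bytes per Chinese character, total length word_len
#       ext_len: two-byte integer, the length of the extension info; it seems to always be 10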