Exemple #1
0
def add_search_cont(search_list):
    """Insert every search result in ``search_list`` into ``weibo_search``.

    Each item is inserted on its own; if one insert raises, the error is
    logged through ``storage.error`` together with the row data and the loop
    moves on to the next item.

    :param search_list: iterable of objects exposing the attributes listed in
        ``columns`` below (one attribute per table column)
    :return: None
    """
    save_sql = (
        'insert into weibo_search (mk_primary,mid,murl,create_time,praise_count,repost_count,comment_count,'
        'content,device,user_id,username,uheadimage,user_home,keyword) values(:mk_primary, :mid, '
        ':murl, :create_time, :praise_count,:repost_count, :comment_count, :content, :device, '
        ':user_id, :username,:uheadimage, :user_home, :keyword)')
    # Bind-parameter names; each one is read from the attribute of the same
    # name on the search result object.
    columns = (
        'mk_primary', 'mid', 'murl', 'create_time', 'praise_count',
        'repost_count', 'comment_count', 'content', 'device', 'user_id',
        'username', 'uheadimage', 'user_home', 'keyword',
    )
    with db_connect.db_execute() as conn:
        for search_cont in search_list:
            search_info = {
                name: getattr(search_cont, name) for name in columns
            }
            try:
                db_connect.db_dml_parms(conn, save_sql, search_info)
            except Exception as why:
                # search_info is a plain dict and has no __dict__ attribute;
                # accessing it here used to raise AttributeError from inside
                # the handler and abort the whole batch. Log the dict itself.
                storage.error('插入出错,具体原因为:{why}, 插入数据是{info}'.format(
                    why=why, info=search_info))
Exemple #2
0
def update_repost_comment(**kwargs):
    """Refresh the stored repost/comment counts of one weibo if they changed.

    Reads the current ``se_repost_count`` / ``se_comment_count`` for the given
    ``mid`` from ``weibo_search_data`` and issues an UPDATE only when at least
    one of them differs from the supplied values.

    :param kwargs: expects the keys ``mid``, ``reposts`` and ``comments``;
        missing keys default to ``None``
    :return: None
    """
    mid = kwargs.get('mid')
    reposts = kwargs.get('reposts')
    comments = kwargs.get('comments')
    sql = 'select se_repost_count, se_comment_count from weibo_search_data where se_mid = :mid'
    args = dict(mid=mid)

    with db_connect.db_execute() as conn:
        rs = db_connect.db_queryone_params(conn, sql, args)
        # NOTE(review): presumably the helper returns None when no row matches
        # (DB-API fetchone convention) - confirm. Without this guard rs[0]
        # raises TypeError; skipping is safe because the UPDATE below would
        # not match any row either.
        if rs is None:
            return
        if reposts != rs[0] or comments != rs[1]:
            update_sql = (
                'update weibo_search_data set se_repost_count = :reposts, se_comment_count = :comments '
                'where se_mid = :mid')
            update_args = dict(mid=mid, reposts=reposts, comments=comments)
            db_connect.db_dml_parms(conn, update_sql, update_args)
Exemple #3
0
def get_crawl_urls():
    """
    Fetch the rows of ``weibo_search_data`` with ``is_crawled = 0``, i.e. the
    weibo posts that still need spread analysis.

    :return: list of dicts, one per row, each with a ``url`` built from the
        first two selected columns and a ``mid`` taken from the third
    """
    query = (
        'select se_userid,se_sid, se_mid from weibo_search_data where is_crawled = 0 and '
        'se_sourcetype = \'新浪微博\' order by se_createtime desc')

    pending = []
    with db_connect.db_execute() as conn:
        rows = db_connect.db_queryall(conn, query)
        # Rows are consumed while the connection context is still open.
        pending.extend(
            {'url': 'http://weibo.com/' + row[0] + '/' + row[1], 'mid': row[2]}
            for row in rows
        )

    return pending
Exemple #4
0
def get_repost_comment(mid):
    """Look up the stored repost and comment counts for one weibo.

    :param mid: value bound to ``se_mid`` in the lookup query
    :return: the result of ``db_connect.db_queryone_params`` for the query
        selecting ``se_repost_count, se_comment_count``
    """
    query = 'select se_repost_count, se_comment_count from weibo_search_data where se_mid = :mid'
    with db_connect.db_execute() as conn:
        row = db_connect.db_queryone_params(conn, query, {'mid': mid})
    return row
Exemple #5
0
def update_weibo_url(mid):
    """Flag the ``weibo_search_data`` row(s) with this ``se_mid`` as crawled.

    :param mid: weibo id; converted with ``str()`` before being bound
    :return: None
    """
    params = dict(mid=str(mid))
    statement = "update weibo_search_data set is_crawled = 1 where se_mid = :mid"
    with db_connect.db_execute() as conn:
        db_connect.db_dml_parms(conn, statement, params)
Exemple #6
0
def save(user, mid, post_time, source, reposts_count, comments_count,
         root_url):
    """Store the original (root) weibo unless its spread data already exists.

    :param user: user object of the author
    :param mid: weibo id
    :param post_time: time the weibo was posted
    :param source: stored in the ``status_source`` column (the original
        docstring describes it as the page source)
    :param reposts_count: number of reposts
    :param comments_count: number of comments
    :param root_url: URL of the original weibo
    :return: ``True`` when the weibo was inserted and its spread should be
        crawled, ``False`` when both the original weibo and at least one
        repost are already stored
    """
    # Bind the id instead of formatting it into the SQL text: mid comes from
    # crawled data, and every other query in this code base uses :name binds.
    original_sql = (
        'select count(*) from weibo_spread_original where status_mid = :mid')
    child_sql = (
        'select count(*) from weibo_spread_other where original_status_id = :mid')
    query_args = {'mid': str(mid)}

    to_crawl = True

    with db_connect.db_execute() as conn:
        original_row = db_connect.db_queryone_params(
            conn, original_sql, query_args)
        child_row = db_connect.db_queryone_params(conn, child_sql, query_args)

        # If the original weibo and some of its reposts are already in the
        # database, we consider it does not need crawling again.
        if original_row[0] > 0 and child_row[0] > 0:
            print('关于此条微博的扩散信息已经存于数据库中')
            to_crawl = False
        else:
            # NOTE(review): this branch also runs when the original row exists
            # but has no reposts stored yet, so the original is inserted
            # again - confirm that is intended.
            insert_sql = (
                'insert into weibo_spread_original (user_id,user_screenname,user_province,user_city,'
                'user_location, user_description,user_url,user_profileimageurl,user_gender,'
                'user_followerscount,user_friendscount,user_statusescount,user_createdat,user_verifiedtype,'
                'user_verifiedreason,status_createdat,status_mid,status_source,status_repostscount,'
                'status_commentscount,status_url) values (:user_id,:user_screenname,:user_province,'
                ':user_city,:user_location,:user_description,:user_url,:user_profileimageurl,:user_gender,'
                ':user_followerscount,:user_friendscount,:user_statusescount,'
                ':user_createdat,:user_verifiedtype,:user_verifiedreason,:status_createdat,:status_mid,'
                ':status_source,:status_repostscount,:status_commentscount,:status_url)'
            )

            args = {
                'user_id': user.id,
                'user_screenname': user.screen_name,
                'user_province': user.province,
                'user_city': user.city,
                'user_location': user.location,
                # Round-trip through GBK to drop characters GBK cannot encode.
                'user_description':
                    user.description.encode('gbk', 'ignore').decode('gbk'),
                'user_url': user.blog_url,
                'user_profileimageurl': user.headimg_url,
                'user_followerscount': user.followers_count,
                'user_friendscount': user.friends_count,
                'user_statusescount': user.status_count,
                'user_createdat': user.register_time,
                'user_verifiedtype': user.verify_type,
                'user_verifiedreason':
                    user.verify_info.encode('gbk', 'ignore').decode('gbk'),
                'user_gender': user.gender,
                'status_createdat': post_time,
                'status_mid': mid,
                'status_source': source,
                'status_repostscount': reposts_count,
                'status_commentscount': comments_count,
                'status_url': root_url,
            }

            db_connect.db_dml_parms(conn, insert_sql, args)

    return to_crawl
def save(sos):
    """Insert repost records into ``weibo_spread_other`` one at a time.

    An empty-string ``verify_type`` on an item is replaced by ``0`` (the item
    itself is modified). If building the parameters or running the insert
    raises, the item's ``__dict__`` and the exception are logged through
    ``storage.error`` and the loop moves on. The number of successful inserts
    is logged at the end through ``storage.info``.

    :param sos: iterable of repost objects exposing the attributes read below
    :return: None
    """

    def _gbk_safe(text):
        # Round-trip through GBK, dropping characters GBK cannot encode.
        return text.encode('gbk', 'ignore').decode('gbk')

    insert_sql = (
        'insert into weibo_spread_other (user_id,user_screenname,user_province,user_city,user_location,'
        'user_description,user_url,user_profileimageurl,user_gender,user_followerscount,user_friendscount,'
        'user_statusescount,user_createdat,user_verifiedtype,user_verifiedreason,status_createdat,'
        'status_mid,status_source,status_repostscount,status_commentscount,upper_user_id,'
        'original_status_id,status_url) '
        " values (:user_id,:user_screenname,:user_province,:user_city,:user_location,"
        ":user_description,:user_url,:user_profileimageurl,:user_gender,:user_followerscount,"
        ":user_friendscount,:user_statusescount,:user_createdat,:user_verifiedtype,:user_verifiedreason,"
        ":status_createdat,:status_mid,:status_source,:status_repostscount,:status_commentscount,"
        ":upper_user_id,:original_status_id,:status_url)")
    inserted = 0

    with db_connect.db_execute() as conn:
        for item in sos:
            if item.verify_type == '':
                item.verify_type = 0
            try:
                # Keyword order mirrors the original parameter dict, so the
                # attributes are read in the same sequence.
                args = dict(
                    user_id=item.id,
                    user_url=item.blog_url,
                    user_profileimageurl=item.headimg_url,
                    user_screenname=_gbk_safe(item.screen_name),
                    user_province=_gbk_safe(item.province),
                    user_city=_gbk_safe(item.city),
                    user_location=_gbk_safe(item.location),
                    user_description=_gbk_safe(item.description),
                    user_gender=_gbk_safe(item.gender),
                    user_verifiedreason=_gbk_safe(item.verify_info),
                    status_source=_gbk_safe(item.device),
                    user_followerscount=int(item.followers_count),
                    user_friendscount=int(item.friends_count),
                    user_statusescount=int(item.status_count),
                    status_repostscount=int(item.reposts_count),
                    status_commentscount=int(item.comments_count),
                    user_verifiedtype=item.verify_type,
                    user_createdat=item.register_time,
                    status_createdat=item.status_post_time,
                    status_mid=item.mid,
                    upper_user_id=item.upper_user_id,
                    original_status_id=item.original_status_id,
                    status_url=item.status_url,
                )
                db_connect.db_dml_parms(conn, insert_sql, args)
            except Exception as why:
                storage.error(item.__dict__)
                storage.error(why)
                continue
            inserted += 1
        storage.info('一共插入了{ins}条数据'.format(ins=inserted))