Example #1
import json
import os
import time
from urllib.parse import urlparse

import requests

from handle_mysql import MySQL

# audio_headers and image_headers are request-header dicts defined elsewhere
# in this project.


def down_dedao():
    global problem_list
    mysql = MySQL()

    try:
        mysql.get_connection()

        select_results = mysql.select('article',
                                      ['article_id', 'article_content'],
                                      'avatar_uploaded = 0')

        print(len(select_results))
        problem_list = []
        # type:"audio , audio:"
        # type:"image , src:"
        for result in select_results:
            resource_list = []
            # article id
            id = result[0]

            # Append the author's avatar URL
            article_author = mysql.select('article_author',
                                          ['article_id', 'author_id'],
                                          'article_id = ' + str(id))
            author_info = mysql.select(
                'author', ['author_name', 'author_avatar'],
                'author_id = ' + str(article_author[0][1]))
            resource_list.append(author_info[0][1])

            # Query extra attributes (cover image, comments)
            ext_attributes = mysql.select(
                'ext_attribute',
                ['article_id', 'attribute_name', 'attribute_value'],
                'article_id = ' + str(id))
            for ext_attribute in ext_attributes:
                # Append commenter avatars
                if ext_attribute[1] == 'comment':
                    comment = json.loads(ext_attribute[2])
                    for note in comment['list']:
                        resource_list.append(note['notes_owner']['avatar'])

            # Article body
            try:

                # Create the target directory first
                os.mkdir(os.path.join(os.path.abspath('resource'), str(id)))
                # Download each resource for this article, setting the Host
                # header per URL
                for resource in resource_list:
                    # Parse the URL; pick audio vs. image headers by extension
                    audio_parse = urlparse(resource)
                    if '.m4a' in resource or '.mp4' in resource:
                        audio_headers['Host'] = audio_parse[1]
                        audio = requests.get(resource, headers=audio_headers)
                    else:
                        image_headers['Host'] = audio_parse[1]
                        audio = requests.get(resource, headers=image_headers)

                    # Extract the resource file name from the URL path
                    audio_name = audio_parse[2][audio_parse[2].rfind('/') + 1:]
                    # Save the resource under resource/<id>/
                    with open(
                            os.path.join(os.path.abspath('resource'), str(id),
                                         audio_name), 'wb') as f:
                        f.write(audio.content)
                    print(
                        'Resource downloaded: %s' %
                        os.path.join(os.path.abspath('resource'), str(id),
                                     audio_name))
                    time.sleep(2)

            except Exception:
                # Record articles whose resources failed to download
                problem_list.append(id)

    finally:
        print(problem_list)
        mysql.close_connection()
Example #2
            print('File %s uploaded: %s' % (path, DOMAIN + key))
            return DOMAIN + key

        except Exception as e:
            print(e)
            print('Retrying upload of %s, attempt number %s' % (path, current_count))
            return upload_file(path, current_count + 1, max_retry_count)

    else:
        print('File %s exceeded %s upload attempts, giving up!' % (path, max_retry_count))
        # Raise once all retries (3 by default) have failed
        raise Exception('File %s exceeded %s upload attempts, giving up!' % (path, max_retry_count))
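
The fragment above is missing the head of upload_file. A minimal sketch of the full retry wrapper, assuming a hypothetical do_upload(path) helper that returns the storage key and a module-level DOMAIN URL prefix (neither is shown in the original snippet):

def upload_file(path, current_count=0, max_retry_count=3):
    # Recursive retry wrapper: attempt the upload, recurse on failure,
    # and raise once the retry budget is exhausted.
    if current_count < max_retry_count:
        try:
            key = do_upload(path)  # hypothetical upload call
            print('File %s uploaded: %s' % (path, DOMAIN + key))
            return DOMAIN + key
        except Exception as e:
            print(e)
            print('Retrying upload of %s, attempt number %s' % (path, current_count))
            return upload_file(path, current_count + 1, max_retry_count)
    else:
        print('File %s exceeded %s upload attempts, giving up!' % (path, max_retry_count))
        raise Exception('File %s exceeded %s upload attempts, giving up!' % (path, max_retry_count))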


if __name__ == '__main__':
    mysql = MySQL()
    mysql.get_connection()
    try:
        print(os.path.abspath('resource'))
        dir_list = os.listdir(os.path.abspath('resource'))
        # Each directory holds all the resources for one article
        for dir in dir_list:
            try:
                dir_path = os.path.join(os.path.abspath('resource'), dir)
                file_list = os.listdir(dir_path)
                for file in file_list:
                    file_path = os.path.join(dir_path, file)
                    # Upload; retried up to 3 times on failure
                    upload_file(file_path)
                    # time.sleep(1)
                # Update the article's uploaded flag to mark success
Example #3
from handle_mysql import MySQL

mysql = MySQL()
try:

    mysql.get_connection()
    mysql.insert('tb_column', )

    # column = mysql.select('tb_column', ['column_id', 'current_article_num'], 'column_name="%s"' % '万维钢·精英日课³')
    # # print(result)
    #
    # crawled_article = []
    #
    #     # If the column already exists in the database
    # if column:
    #     column_id = column[0][0]
    #     current_article_num = column[0][1]
    #     print(column_id, current_article_num)
    #
    #
    #     def f(x):
    #         return x[0]
    #
    #
    #     crawled_article_id = mysql.select('article_column', ['article_id'], 'column_id="%s"' % column_id)
    #     crawled_article_ids = list(map(f, crawled_article_id))
    #     print(crawled_article_ids)
    #     print(len(crawled_article_ids), current_article_num)
    #     # if len(crawled_article_ids) < current_article_num:
    #     for article_id in crawled_article_ids:
    #         article_name = mysql.select('article', ['article_name'], 'article_id="%s"' % article_id)
Example #4
import time

from handle_mysql import MySQL


def check_num(column_id):
    '''
    Check whether any articles are missing for the given column_id.
    :param column_id:
    :return:
    '''
    mysql = MySQL()
    try:
        mysql.get_connection()
        ids = mysql.select('article_column', ['article_id'],
                           'column_id=%s' % column_id)
        print(ids)
        print(len(ids))
        time.sleep(3)

        def f(x):
            return x[0]

        ids = list(map(f, ids))
        print(ids)

        # Find the id of the first article (the one whose prev_article_id is 0)
        first_id = None
        for id in ids:
            r = mysql.select(
                'ext_attribute', ['attribute_value'],
                "attribute_name = 'prev_article_id' and attribute_value = '0' and article_id = %s"
                % id)
            if r:
                first_id = id
                break
        if first_id is None:
            print('No first article found for column %s' % column_id)
            return

        print(first_id)
        time.sleep(3)
        broken = True
        for i in range(400):
            # Look up the id of the article after the current one
            r = mysql.select(
                'ext_attribute', ['attribute_value'],
                "attribute_name = 'next_article_id' and article_id = %s" %
                first_id)
            if not r:
                break
            if int(r[0][0]) == 0:
                broken = False
                break
            # Check whether the article with that next id exists
            r2 = mysql.select('article', ['article_id'],
                              "article_id = %s" % r[0][0])
            # If it does not exist, the chain is broken
            if not r2:
                break
            first_id = r[0][0]

        # The article following this first_id is missing
        if broken:
            print('The article after id %s does not exist!' % first_id)
        else:
            print('No missing articles!')

        problem_list = []

        # for result in select_results:
        #     try:
        #         contents = json.loads(result[1])
        #     except:
        #         problem_list.append(result[0])
        # if problem_list:
        #     print('Problem article ids:', problem_list)
        # else:
        #     print('No articles had problems')
    finally:
        mysql.close_connection()
Example #5
import os
from down_upload_resource.download_file import down_dedao
from down_upload_resource.upload_file import upload_file
from down_upload_resource.delete_dir import clear_dir
from handle_mysql import MySQL
import time

# Download one batch of articles (here 50) into the resource folder
down_dedao(522897, 50)

# Upload everything that was downloaded
mysql = MySQL()
mysql.get_connection()
try:
    print(os.path.abspath('resource'))
    dir_list = os.listdir(os.path.abspath('resource'))
    # Each directory holds all the resources for one article
    for dir in dir_list:
        try:
            dir_path = os.path.join(os.path.abspath('resource'), dir)
            file_list = os.listdir(dir_path)
            for file in file_list:
                file_path = os.path.join(dir_path, file)
                # Upload; retried up to 3 times on failure
                upload_file(file_path)
                # time.sleep(1)
            # Update the article's uploaded flag to mark success
            mysql.update('article', 'uploaded = 1', 'article_id = "%s"' % dir)
        except Exception as e:
            print(e)
finally:
    mysql.close_connection()
Example #6
                reset_columns(driver)
                break


# Reset the column list
def reset_columns(driver):
    # Go back to reset the list
    driver.back()
    # Tap the courses tab
    if wait.until(lambda x: x.find_element_by_xpath(
            "//android.widget.TextView[@resource-id='com.luojilab.player:id/tv_course']"
    )):
        driver.find_element_by_xpath(
            "//android.widget.TextView[@resource-id='com.luojilab.player:id/tv_course']"
        ).click()

    # Tap "recently purchased" inside the courses tab
    if wait.until(lambda x: x.find_element_by_id(
            "com.luojilab.player:id/nearBuyBtn")):
        driver.find_element_by_id("com.luojilab.player:id/nearBuyBtn").click()


if __name__ == '__main__':

    mysql = MySQL()
    try:
        mysql.get_connection()
        handle_dedao(driver)
    finally:
        mysql.close_connection()
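
Examples #6 and #7 assume module-level driver and wait objects. A minimal setup sketch, assuming the Appium Python client and Selenium's WebDriverWait; the device name and entry activity below are placeholders, not values from the original project:

from appium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Placeholder capabilities; the real device name and entry activity would
# come from the target phone and the installed DeDao app.
caps = {
    'platformName': 'Android',
    'deviceName': 'demo_device',
    'appPackage': 'com.luojilab.player',
    'appActivity': '.ui.SplashActivity',  # assumed entry activity
    'noReset': True,
}
driver = webdriver.Remote('http://127.0.0.1:4723/wd/hub', caps)
wait = WebDriverWait(driver, 20)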
Example #7
def crawl_article(driver):
    l = get_size(driver)
    x1 = int(l[0] * 0.5)
    y1 = int(l[1] * 0.75)
    y2 = int(l[1] * 0.25)

    # Swipe up repeatedly; stop once the page source stops changing,
    # i.e. we have scrolled to the bottom of the article.
    n = 0
    while True:
        temp = driver.page_source
        driver.swipe(x1, y1, x1, y2)
        time.sleep(0.2)
        # if temp == driver.page_source and '点击加载留言' in driver.page_source:
        if temp == driver.page_source:
            if n > 3:
                break
            n += 1


if __name__ == '__main__':
    mysql = MySQL()
    try:
        # mysql.get_connection()
        handle_dedao(driver)
    finally:
        pass
        # mysql.close_connection()
Example #8
import json
import os
import pickle
import re

from handle_mysql import MySQL


def response(flow):
    # Comments for the article
    # https://entree.igetget.com/bauhinia/v1/article/info
    if 'igetget.com/ledgers/notes/article_comment_list' in flow.request.url:

        content = json.loads(flow.response.text)

        # If more comments are coming, load the cached pages, append the new
        # page, and store the result back
        if content['c']['isMore']:
            if not os.path.exists('../temp_file/5.txt') or os.path.getsize('../temp_file/5.txt') == 0:
                with open('../temp_file/5.txt', 'wb') as f:
                    pickle.dump(content, f)
            else:
                print('Size before:', os.path.getsize('../temp_file/5.txt'))

                with open('../temp_file/5.txt', 'rb') as f:
                    temp = pickle.load(f)

                with open('../temp_file/5.txt', 'wb') as f:
                    f.truncate()

                print('Size after clearing:', os.path.getsize('../temp_file/5.txt'))
                temp['c']['list'] = temp['c']['list'] + content['c']['list']
                with open('../temp_file/5.txt', 'wb') as f:
                    pickle.dump(temp, f)
                print('Size after:', os.path.getsize('../temp_file/5.txt'))

        # Either there is only one page, or this is the last page
        else:
            # Pull the cached pages from the file, merge, and store everything
            # in the database together with the article id
            # If this is the last page
            request = flow.request
            print(request.content)
            bys = request.content
            s = '&' + bys.decode('utf-8') + '&'
            page_search = re.compile(r"&page=(.*?)&")
            page = re.search(page_search, s).groups(1)

            page_count_search = re.compile(r"&page_count=(.*?)&")
            page_count = re.search(page_count_search, s).groups(1)

            article_id_search = re.compile(r"&article_id=(.*?)&")
            article_id = re.search(article_id_search, s).groups(1)

            print(page[0], page_count[0], article_id[0])

            if os.path.getsize('../temp_file/5.txt'):
                print('Last page')
                with open('../temp_file/5.txt', 'rb') as f:
                    temp = pickle.load(f)
                temp['c']['list'] = temp['c']['list'] + content['c']['list']
                print('Total', temp['c']['total'], 'current', len(temp['c']['list']))
                # temp is parsed, assembled, and stored in the database below
                with open('../temp_file/5.txt', 'wb') as f:
                    f.truncate()

            # Only this single page
            else:
                # Store it directly; there is no cache file to clear
                print('Single page')
                print('Total', content['c']['total'], 'current', len(content['c']['list']))
                temp = content

            ext_info = {}

            ext_info['article_id'] = int(article_id[0])

            ext_info['attribute_name'] = 'comment'

            comment = {}
            # id of the article the comments belong to
            comment['article_id'] = article_id[0]

            comment['per_page_count'] = page_count[0]

            # comment list
            comment['list'] = []

            # total number of comments
            comment['comment_total'] = temp['c']['total']
            if 0 != int(comment['comment_total']):

                # Extra info for the comments, including column info, kept just in case
                comment['extra'] = temp['c']['list'][0].get('extra')

                for a_comment in temp['c']['list']:
                    comment_dict = {}
                    comment_dict['note_id'] = a_comment.get('note_id')
                    # comment body
                    comment_dict['note'] = a_comment.get('note')
                    comment_dict['content'] = a_comment.get('content')
                    comment_dict['note_title'] = a_comment.get('note_title')
                    # author reply
                    comment_dict['note_line'] = a_comment.get('note_line')
                    # timestamps
                    comment_dict['comment_reply_time'] = a_comment.get('comment_reply_time')
                    comment_dict['create_time'] = a_comment.get('create_time')
                    comment_dict['update_time'] = a_comment.get('update_time')

                    # repost count
                    comment_dict['repost_count'] = a_comment['notes_count'].get('repost_count', 0)
                    # reply count
                    comment_dict['comment_count'] = a_comment['notes_count'].get('comment_count', 0)
                    # like count
                    comment_dict['like_count'] = a_comment['notes_count'].get('like_count', 0)
                    comment_dict['notes_owner'] = a_comment.get('notes_owner')
                    comment['list'].append(comment_dict)
            # print(comment)
            else:
                comment['list'] = []

            # Keep at most the first 18 comments
            if len(comment['list']) > 18:
                comment['list'] = comment['list'][:18]

            ext_info['attribute_value'] = json.dumps(comment,ensure_ascii=False)

            # print(len(json.dumps(comment)))
            # Insert into the database
            mysql = MySQL()
            try:
                mysql.get_connection()
                # Dedup currently only checks whether this article_id already
                # has a 'comment' ext attribute; the comment content itself is
                # not compared
                mysql.insert('ext_attribute', ext_info)

            finally:
                mysql.close_connection()
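
Both comment handlers (here and in Example #10) recover the request parameters by wrapping the form body in '&' and running regexes over it. A more robust sketch, assuming the body is standard URL-encoded form data, could use urllib.parse.parse_qs instead; form_params is a hypothetical helper, not part of the original project:

from urllib.parse import parse_qs


def form_params(flow):
    # Decode the form-encoded request body into a dict of single values.
    fields = parse_qs(flow.request.content.decode('utf-8'))
    return {k: v[0] for k, v in fields.items()}

# Usage inside the response hook:
#   params = form_params(flow)
#   page, page_count = params['page'], params['page_count']
#   article_id = params['article_id']  # or 'detail_id', depending on the endpoint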
Example #9
import json
import os
import time
from urllib.parse import urlparse

import requests

from handle_mysql import MySQL

# audio_headers, image_headers and proxies are module-level dicts defined
# elsewhere in this project.


def down_dedao(column_id, num):
    print(image_headers)
    print(proxies)
    global problem_list
    mysql = MySQL()

    try:
        mysql.get_connection()

        # select_results = mysql.select('article',['article_id','article_content'],'uploaded = 0 limit %s' % num)
        select_results = mysql.select(
            'article', ['article_id', 'article_content'],
            'uploaded = 0 and article_id in (SELECT article_id FROM article_column WHERE column_id = %s) limit %s'
            % (column_id, num))

        print(len(select_results))
        problem_list = []
        # type:"audio , audio:"
        # type:"image , src:"
        for result in select_results:
            resource_list = []
            file_path_list = []
            # article id
            id = result[0]

            # Append the author's avatar URL
            article_author = mysql.select('article_author',
                                          ['article_id', 'author_id'],
                                          'article_id = ' + str(id))
            author_info = mysql.select(
                'author', ['author_name', 'author_avatar'],
                'author_id = ' + str(article_author[0][1]))
            resource_list.append(author_info[0][1])

            # Query extra attributes (cover image, comments)
            ext_attributes = mysql.select(
                'ext_attribute',
                ['article_id', 'attribute_name', 'attribute_value'],
                'article_id = ' + str(id))
            for ext_attribute in ext_attributes:
                # Append the cover image
                if ext_attribute[1] == 'cover_image':
                    resource_list.append(ext_attribute[2])
                # Append commenter avatars
                if ext_attribute[1] == 'comment':
                    comment = json.loads(ext_attribute[2])
                    for note in comment['list']:
                        resource_list.append(note['notes_owner']['avatar'])

            # Article body
            try:
                contents = json.loads(result[1])

                for content in contents:
                    if content['type'] == 'audio':
                        resource_list.append(content['audio']['mp3_play_url'])
                    elif content['type'] == 'image' and content.get('src'):
                        resource_list.append(content.get('src'))

                # Create the target directory first
                os.mkdir(os.path.join(os.path.abspath('resource'), str(id)))
                # Download each resource for this article, setting the Host
                # header per URL
                for resource in resource_list:
                    # Parse the URL; pick audio vs. image headers by extension
                    audio_parse = urlparse(resource)
                    if '.m4a' in resource or '.mp4' in resource:
                        audio_headers['Host'] = audio_parse[1]
                        # audio = requests.get(resource, headers=audio_headers, proxies=proxies)
                        audio = requests.get(resource, headers=audio_headers)
                    else:
                        image_headers['Host'] = audio_parse[1]
                        # audio = requests.get(resource, headers=image_headers, proxies=proxies)
                        audio = requests.get(resource, headers=image_headers)

                    # Extract the resource file name from the URL path
                    audio_name = audio_parse[2][audio_parse[2].rfind('/') + 1:]
                    # Save the resource under resource/<id>/
                    with open(
                            os.path.join(os.path.abspath('resource'), str(id),
                                         audio_name), 'wb') as f:
                        f.write(audio.content)
                        file_path_list.append(
                            os.path.join(os.path.abspath('resource'), str(id),
                                         audio_name))
                    print(
                        'Resource downloaded: %s' %
                        os.path.join(os.path.abspath('resource'), str(id),
                                     audio_name))
                    time.sleep(1)

            except Exception:
                # Record articles whose resources failed to download
                problem_list.append(id)

    finally:
        print(problem_list)
        mysql.close_connection()
Example #10
    def response(self, flow: mitmproxy.http.HTTPFlow):
        # Request sent when listing all purchased columns
        # Be careful not to wipe the previously cached pages here
        # if 'entree.igetget.com/purchased/v2/product/allList' in flow.request.url:
        if 'igetget.com/purchased/v2/product/allList' in flow.request.url:
            self.num = self.num + 1
            ctx.log.info("We've seen %d flows" % self.num)

            # Inspect the request to see whether this is the first page
            request = flow.request
            bys = request.content
            s = '&' + bys.decode('utf-8') + '&'
            page_search = re.compile(r"&page=(.*?)&")
            page = re.search(page_search, s).groups(1)

            print('Page number:', page[0])
            # First page: reset the 1.txt cache file
            if page[0] == '1':
                print('------------------')
                if os.path.exists('temp_file/1.txt'):
                    # remove the old cache
                    os.remove('temp_file/1.txt')
                # create a fresh, empty one
                with open('temp_file/1.txt', 'wb') as f:
                    pass

            content = json.loads(flow.response.text)

            _1_info_before = {}
            # Columns must be appended here, otherwise the data is lost after a refresh

            # If page == 1 this was the first request, so 1.txt was cleared and
            # recreated above; 1.txt must exist before reading it
            with open('temp_file/1.txt', 'rb') as f:
                # read only if the file is non-empty
                if os.path.getsize('temp_file/1.txt'):
                    print('not empty')
                    # load the cached data and reassemble it
                    _1_info_before = pickle.load(f)

            if _1_info_before:
                _1_info_before['c']['list'] = _1_info_before['c'][
                    'list'] + content['c']['list']

            # A redis cache would be better than a pickle file here
            with open('temp_file/1.txt', 'wb') as f:
                # if the file already has data, it is rewritten with _1_info_before below
                if os.path.getsize('temp_file/1.txt'):
                    # Clear all data from the file
                    # It must be emptied, otherwise stale pages accumulate
                    f.truncate()
                else:
                    pickle.dump(content, f)

            # For pages after the first, write back the merged data
            if page[0] != '1':
                with open('temp_file/1.txt', 'wb') as f:
                    if os.path.getsize('temp_file/1.txt') == 0:
                        pickle.dump(_1_info_before, f)

        # Request 1 sent after tapping a column
        # if 'entree.igetget.com/bauhinia/v1/class/purchase/info' in flow.request.url:
        if 'igetget.com/bauhinia/v1/class/purchase/info' in flow.request.url:
            content = json.loads(flow.response.text)

            with open('temp_file/1.txt', 'rb') as f:
                _1_info = pickle.load(f)

            column_info = {}
            author_info = {}
            source_info = {}
            # Assemble the column info for the column that was tapped
            for alist in _1_info['c']['list']:
                # Only columns whose category is 40 are crawled for now
                # Prefer the fields from content where possible
                # Column ids are not unique, so constrain by type as well;
                # otherwise one column's info can end up attached to another
                # Try pinning the column down via column_id plus content category
                # print('-----------'* 10)
                # print(alist['id'])
                # print(content['c']['class_info']['product_id'])
                # print(alist['category'])
                # print(alist['type'])
                # print(content['c']['class_info']['product_type'])
                # print(content['c']['class_info']['name'])
                # print(alist['title'])
                # print('-----------' * 10)

                if alist['id'] == content['c']['class_info'][
                        'product_id'] and alist['category'] == 40 and alist[
                            'type'] == content['c']['class_info'][
                                'product_type'] and content['c']['class_info'][
                                    'name'] == alist['title']:
                    # Column
                    # Generate the column id from the column name
                    column_info['column_id'] = generate_id(
                        content['c']['class_info']['name'])
                    column_info['column_name'] = content['c']['class_info'][
                        'name']
                    column_info['column_info'] = content['c']['items'][1][
                        'content']
                    column_info['column_learn_num'] = content['c'][
                        'class_info']['learn_user_count']
                    # total number of lessons
                    column_info['article_num'] = content['c']['class_info'][
                        'phase_num']
                    column_info['current_article_num'] = content['c'][
                        'class_info']['current_article_count']
                    column_info['finished'] = 0 if content['c']['class_info'][
                        'phase_num'] - content['c']['class_info'][
                            'current_article_count'] > 0 else 1
                    column_info['crawl_time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime())

                    # Author
                    author_info['author_id'] = generate_id(
                        content['c']['class_info']['lecturer_name'])
                    author_info['author_name'] = content['c']['class_info'][
                        'lecturer_name']
                    author_info['author_avatar'] = content['c']['class_info'][
                        'lecturer_avatar']
                    author_info['author_info'] = content['c']['items'][0][
                        'content']

                    # Source
                    source_info['source_name'] = 'https://www.igetget.com/'
                    source_info['source_info'] = ''
                    source_info['source_id'] = generate_id(
                        source_info['source_name'])

                    break

            mysql = MySQL()
            mysql.get_connection()

            try:

                if column_info:
                    print(json.dumps(column_info))
                    # Store the category == 40 column info in the database
                    mysql.insert('tb_column', column_info)

                if author_info:
                    print(json.dumps(author_info))
                    # Store the author info in the database
                    mysql.insert('author', author_info)

                if source_info:
                    print(json.dumps(source_info))
                    # Store the source info in the database
                    mysql.insert('source', source_info)
            except Exception as e:
                print(e)
            finally:
                mysql.close_connection()

            with open('temp_file/2.txt', 'wb') as f:
                pickle.dump(content, f)

        # Request 1 sent after tapping an article inside a column
        # https://entree.igetget.com/bauhinia/v1/article/info
        if 'igetget.com/bauhinia/v1/article/info' in flow.request.url:
            content = json.loads(flow.response.text)

            with open('temp_file/4.txt', 'wb') as f:
                pickle.dump(content, f)

        # When the request url contains the string below, parse the article body
        # This response carries the article title, cover image, and body
        # We still need to record which column and chapter the article belongs to
        if 'igetget.com/ddarticle/v1/article/get' in flow.request.url:
            content = json.loads(flow.response.text)

            article_info = {}
            ext_info = {}
            article_author_info = {}
            article_column_info = {}
            article_source_info = {}
            article_category_info = {}

            with open('temp_file/4.txt', 'rb') as f:
                _4_info = pickle.load(f)

            mysql = MySQL()
            mysql.get_connection()
            try:
                if _4_info and _4_info['c']['dd_article_id'] == content[
                        'data']['article']['Id']:
                    # Embedding the column info directly in the article row (disabled)
                    # article_info['column_id'] = generate_id(_4_info['c']['class_info']['name'])
                    # article_info['column_name'] = _4_info['c']['class_info']['name']
                    # article_info['class_id'] = _4_info['c']['article_info']['class_id']

                    # article id
                    article_info['article_id'] = _4_info['c']['article_info'][
                        'id']
                    article_info['article_name'] = _4_info['c'][
                        'article_info']['title']
                    # The body is a JSON string, so loads it first
                    sub_content = json.loads(content['data']['content'])
                    # Parse the body, convert it to a string for storage, and fix the encoding
                    temp = handle_dedao_dict(
                        get_content_list(sub_content, _4_info))
                    article_info['article_content'] = temp

                    # Show the article info
                    print('article_info: ' + json.dumps(article_info))


                    # Insert into the article table
                    mysql.insert('article', article_info)

                    # Link the article to its author
                    article_author_info['article_id'] = article_info[
                        'article_id']
                    article_author_info['author_id'] = generate_id(
                        _4_info['c']['class_info']['lecturer_name'])
                    print('article_author_info: ' +
                          json.dumps(article_author_info))
                    mysql.insert('article_author', article_author_info)

                    # Link the article to its column
                    article_column_info['article_id'] = article_info[
                        'article_id']
                    article_column_info['column_id'] = generate_id(
                        _4_info['c']['class_info']['name'])
                    print('article_column_info: ' +
                          json.dumps(article_column_info))
                    mysql.insert('article_column', article_column_info)

                    # Link the article to its source
                    article_source_info['article_id'] = article_info[
                        'article_id']
                    article_source_info['source_id'] = generate_id(
                        'https://www.igetget.com/')
                    print('article_source_info: ' +
                          json.dumps(article_source_info))
                    mysql.insert('article_source', article_source_info)

                    # Extra attributes
                    ext_info['article_id'] = article_info['article_id']
                    ext_info['attribute_name'] = 'prev_article_id'
                    ext_info['attribute_value'] = _4_info['c'][
                        'prev_article_id']
                    # Insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)

                    ext_info['attribute_name'] = 'next_article_id'
                    ext_info['attribute_value'] = _4_info['c'][
                        'next_article_id']
                    # Insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)

                    ext_info['attribute_name'] = 'cover_image'
                    ext_info['attribute_value'] = _4_info['c']['article_info'][
                        'logo']
                    # Insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)

                    ext_info['attribute_name'] = 'article_learn_count'
                    ext_info['attribute_value'] = _4_info['c']['article_info'][
                        'cur_learn_count']
                    # Insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)

                    ext_info['attribute_name'] = 'audio_url'
                    ext_info['attribute_value'] = _4_info['c']['article_info'][
                        'audio']['mp3_play_url']
                    # Insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)

                    # article_info['article_learn_count'] = _4_info['c']['article_info']['cur_learn_count']
                    # article_info['audio_url'] = _4_info['c']['article_info']['audio']['mp3_play_url']

                    # Get the article's chapter name
                    with open('temp_file/2.txt', 'rb') as f:
                        _2_info = pickle.load(f)

                    if _4_info['c']['class_info']['name'] == _2_info['c'][
                            'class_info']['name'] and _4_info['c'][
                                'article_info']['class_id'] == _2_info['c'][
                                    'class_info']['id'] and _2_info['c'][
                                        'class_info']['has_chapter'] == 1:
                        for adict in _2_info['c']['chapter_list']:
                            if _4_info['c']['article_info'][
                                    'chapter_id'] == adict['id']:

                                ext_info['attribute_name'] = 'chapter_name'
                                ext_info['attribute_value'] = adict['name']
                                # Insert into the ext attribute table
                                print('ext_info: ' + json.dumps(ext_info))
                                mysql.insert('ext_attribute', ext_info)

                    # article_info['create_time'] = content['data']['article']['CreateTime']
                    # article_info['update_time'] = content['data']['article']['UpdateTime']
                    # article_info['publish_time'] = content['data']['article']['PublishTime']

            except Exception as e:
                print('Exception while handling entree.igetget.com/ddarticle/v1/article/get')
                print(e)
            finally:
                mysql.close_connection()

        # Comments for the article
        # https://entree.igetget.com/bauhinia/v1/article/info
        if 'igetget.com/ledgers/notes/article_comment_list' in flow.request.url:

            content = json.loads(flow.response.text)

        # If more comments are coming, load the cached pages, append the new
        # page, and store the result back
            if content['c']['isMore']:
                if not os.path.exists('temp_file/5.txt') or os.path.getsize(
                        'temp_file/5.txt') == 0:
                    with open('temp_file/5.txt', 'wb') as f:
                        pickle.dump(content, f)
                else:
                    print('Size before:', os.path.getsize('temp_file/5.txt'))

                    with open('temp_file/5.txt', 'rb') as f:
                        temp = pickle.load(f)

                    with open('temp_file/5.txt', 'wb') as f:
                        f.truncate()

                    print('Size after clearing:', os.path.getsize('temp_file/5.txt'))
                    temp['c'][
                        'list'] = temp['c']['list'] + content['c']['list']
                    with open('temp_file/5.txt', 'wb') as f:
                        pickle.dump(temp, f)
                    print('Size after:', os.path.getsize('temp_file/5.txt'))

            # Either there is only one page, or this is the last page
            else:
                # Pull the cached pages from the file, merge, and store everything
                # in the database together with the article id
                # If this is the last page
                request = flow.request
                print(request.content)
                bys = request.content
                s = '&' + bys.decode('utf-8') + '&'
                page_search = re.compile(r"&page=(.*?)&")
                page = re.search(page_search, s).groups(1)

                page_count_search = re.compile(r"&page_count=(.*?)&")
                page_count = re.search(page_count_search, s).groups(1)

                article_id_search = re.compile(r"&detail_id=(.*?)&")
                # article_id_search = re.compile(r"&article_id=(.*?)&")
                article_id = re.search(article_id_search, s).groups(1)

                print(page[0], page_count[0], article_id[0])

                if os.path.getsize('temp_file/5.txt'):
                    print('Last page')
                    with open('temp_file/5.txt', 'rb') as f:
                        temp = pickle.load(f)
                    temp['c'][
                        'list'] = temp['c']['list'] + content['c']['list']
                    print('Total', temp['c']['total'], 'current',
                          len(temp['c']['list']))
                    # temp is parsed, assembled, and stored in the database below
                    with open('temp_file/5.txt', 'wb') as f:
                        f.truncate()

                # Only this single page
                else:
                    # Store it directly; there is no cache file to clear
                    print('Single page')
                    print('Total', content['c']['total'], 'current',
                          len(content['c']['list']))
                    temp = content

                ext_info = {}

                ext_info['article_id'] = int(article_id[0])

                ext_info['attribute_name'] = 'comment'

                comment = {}
                # id of the article the comments belong to
                comment['article_id'] = article_id[0]

                comment['per_page_count'] = page_count[0]

                # comment list
                comment['list'] = []

                # total number of comments
                comment['comment_total'] = temp['c']['total']
                if 0 != int(comment['comment_total']):

                    # Extra info for the comments, including column info, kept just in case
                    comment['extra'] = temp['c']['list'][0].get('extra')

                    for a_comment in temp['c']['list']:
                        comment_dict = {}
                        comment_dict['note_id'] = a_comment.get('note_id')
                        # comment body
                        comment_dict['note'] = a_comment.get('note')
                        comment_dict['content'] = a_comment.get('content')
                        comment_dict['note_title'] = a_comment.get(
                            'note_title')
                        # author reply
                        comment_dict['note_line'] = a_comment.get('note_line')
                        # timestamps
                        comment_dict['comment_reply_time'] = a_comment.get(
                            'comment_reply_time')
                        comment_dict['create_time'] = a_comment.get(
                            'create_time')
                        comment_dict['update_time'] = a_comment.get(
                            'update_time')

                        # repost count
                        comment_dict['repost_count'] = a_comment[
                            'notes_count'].get('repost_count', 0)
                        # reply count
                        comment_dict['comment_count'] = a_comment[
                            'notes_count'].get('comment_count', 0)
                        # like count
                        comment_dict['like_count'] = a_comment[
                            'notes_count'].get('like_count', 0)
                        comment_dict['notes_owner'] = a_comment.get(
                            'notes_owner')
                        comment['list'].append(comment_dict)
                # print(comment)
                else:
                    comment['list'] = []

                # Keep at most the first 50 comments
                if len(comment['list']) > 50:
                    comment['list'] = comment['list'][:50]

                ext_info['attribute_value'] = json.dumps(comment,
                                                         ensure_ascii=False)

                # print(len(json.dumps(comment)))
                # Insert into the database
                mysql = MySQL()
                try:
                    mysql.get_connection()
                    # Dedup currently only checks whether this article_id
                    # already has a 'comment' ext attribute; the comment
                    # content itself is not compared
                    mysql.insert('ext_attribute', ext_info)

                finally:
                    mysql.close_connection()
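
The same read-merge-write dance on the pickle cache files appears several times above (and the comments already suggest redis would be a better fit). A minimal sketch of that accumulation pattern factored into one hypothetical helper, append_page, assuming each cached response keeps its page items under ['c']['list']:

import os
import pickle


def append_page(cache_path, content):
    # Merge one paginated response into the pickle cache and return the merged dict.
    if os.path.exists(cache_path) and os.path.getsize(cache_path):
        with open(cache_path, 'rb') as f:
            cached = pickle.load(f)
        cached['c']['list'] = cached['c']['list'] + content['c']['list']
    else:
        cached = content
    with open(cache_path, 'wb') as f:
        pickle.dump(cached, f)
    return cached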
Example #11
# -*- coding: utf-8 -*-
from handle_mysql import MySQL
import json


'''
Check for articles whose content is invalid JSON, and collect their ids.
'''
mysql = MySQL()
try:
    mysql.get_connection()

    select_results = mysql.select('article', ['article_id', 'article_content'], 'uploaded = 0')

    problem_list = []

    # type:"audio , audio:"
    # type:"image , src:"
    for result in select_results:
        try:
            contents = json.loads(result[1])
        except Exception:
            problem_list.append(result[0])


    if problem_list:
        print('Problem article ids:', problem_list)
    else:
        print('No articles had problems')
finally:
    mysql.close_connection()