Example #1
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "View more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one = True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth = 1, remark = {'offset':0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain =''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                '''%(uuid, title, author, release_time, website_domain, url, '...'))

            # Save to the database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name , website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # The for loop finished without a break: every item on this page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d'%offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth = 0, remark = {'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
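
The for/else above drives pagination: the else branch runs only when the loop finishes without hitting break. A minimal, self-contained sketch of the same control flow (crawl_page and save are hypothetical names, not part of the project):

def crawl_page(page_items):
    # Mirrors the pagination logic above: stop as soon as saving an item fails.
    for item in page_items:
        saved = save(item)  # hypothetical stand-in for self_base_parser.add_news_acticle
        if not saved:
            break
    else:
        # Runs only if the for loop never hit break: the whole page was saved,
        # so the caller may queue the next page (offset += 50 above).
        return True
    return False

def save(item):
    # Hypothetical saver, present only to make the sketch runnable.
    return bool(item)

print(crawl_page([1, 2, 3]))  # True  -> queue the next page
print(crawl_page([1, 0, 3]))  # False -> stop paging
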
Example #2
def save_article_dynamic(data):
    log.debug(tools.dumps_json(data))

    sql = tools.make_insert_sql('wechat_article_dynamic',
                                data,
                                insert_ignore=True)
    db.add(sql)
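
Every example here serializes data with tools.dumps_json before logging or posting it. The helper itself is not shown; a plausible minimal version, assuming it is a thin wrapper over json.dumps that keeps Chinese text readable, might look like this (an assumption, not the project's actual implementation):

import json

def dumps_json(data, indent=4):
    # Assumed behaviour: pretty-print and keep non-ASCII characters readable.
    try:
        return json.dumps(data, ensure_ascii=False, indent=indent, sort_keys=True)
    except TypeError:
        # Fall back for objects json cannot serialise (e.g. an ObjectId), which is
        # also why the parsers above call str(url_info['_id']) before logging.
        return json.dumps(data, ensure_ascii=False, indent=indent, default=str)
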
Example #3
    def __init__(self, ip_ports=IP_PORTS, db=DB, user_pass=USER_PASS):
        # super(RedisDB, self).__init__()

        if not hasattr(self, '_redis'):
            self._is_redis_cluster = False

            try:
                if len(ip_ports) > 1:
                    startup_nodes = []
                    for ip_port in ip_ports:
                        ip, port = ip_port.split(':')
                        startup_nodes.append({"host": ip, "port": port})

                    self._redis = StrictRedisCluster(
                        startup_nodes=startup_nodes, decode_responses=True)
                    self._pipe = self._redis.pipeline(transaction=False)

                    self._is_redis_cluster = True

                else:
                    ip, port = ip_ports[0].split(':')
                    self._redis = redis.Redis(
                        host=ip,
                        port=port,
                        db=db,
                        password=user_pass,
                        decode_responses=True)  # the default Redis port is 6379
                    self._pipe = self._redis.pipeline(
                        transaction=True
                    )  # By default redis-py checks a connection out of the pool and returns it for every command. A pipeline lets a single request carry several commands, and by default a pipeline executes as one atomic operation.

            except Exception as e:
                raise
            else:
                log.info('连接到redis数据库 %s' % (tools.dumps_json(ip_ports)))
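
The comment on the pipeline above is worth illustrating: redis-py normally checks a connection out of the pool for every command, while a pipeline buffers several commands and sends them in one round trip, wrapped in MULTI/EXEC when transaction=True. A small standalone sketch against a local Redis (host, port and key names are assumptions):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
pipe = r.pipeline(transaction=True)  # queued commands go out in one atomic request

pipe.set('spider:status', 'running')
pipe.incr('spider:page_count')
pipe.expire('spider:status', 3600)
print(pipe.execute())  # one result per queued command, e.g. [True, 1, True]
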
Example #4
    def add_account_info(self, account_info):
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
Example #5
    def deal_request(self, name):
        web.header('Content-Type', 'text/html;charset=UTF-8')

        data = json.loads(json.dumps(web.input()))
        print(name)
        print(data)

        if name == 'get_task':
            tasks = self.task_service.get_task()
            return tools.dumps_json(tasks)

        elif name == 'update_task':
            tasks = eval(data.get('tasks', []))
            status = data.get('status')
            self.task_service.update_task_status(tasks, status)

            return tools.dumps_json('{"status":1}')
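
update_task receives tasks as a string and rebuilds the Python object with eval, which will execute whatever expression the client sends. If the client can send real JSON instead, json.loads does the same conversion without executing anything; a hedged sketch of that alternative (not what the handler above does):

import json

def parse_tasks(raw):
    # Safer replacement for eval(data.get('tasks', [])): accepts only JSON text
    # and returns [] on anything malformed.
    if not raw:
        return []
    try:
        tasks = json.loads(raw)
    except (TypeError, ValueError):
        return []
    return tasks if isinstance(tasks, list) else []
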
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the listing page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one = True)
        program_id = program_url[program_url.find('b/') + 2 : program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html'%program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one = True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one = True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one = True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            '''%(program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode, directors = '', actors = actors, summary = summary, release_time = '')

        # URL for the episode info; without a month parameter it defaults to the most recent month's data
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             '''%(depth+1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #9
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    # Fetch the URLs matched by the search term
    start = 0
    while True:
        urls = mg.search_url(query=root_url,
                             num=50,
                             start=start,
                             pause=random.randint(MIN_SLEEP_TIME,
                                                  MAX_SLEEP_TIME))
        if not urls:
            break

        for url in urls:
            url = url.replace('amp;', '')

            article_extractor = ArticleExtractor(url)
            content = title = release_time = author = website_domain = ''
            content = article_extractor.get_content()
            if content:
                title = article_extractor.get_title()
                release_time = article_extractor.get_release_time()
                author = article_extractor.get_author()
                website_domain = tools.get_domain(url)
                uuid = tools.get_uuid(title, website_domain)
                website_name = ''
                website_position = 35  # overseas

                log.debug('''
                    uuid         %s
                    title        %s
                    author       %s
                    release_time %s
                    domain       %s
                    url          %s
                    content      %s
                    ''' % (uuid, title, author, release_time, website_domain,
                           url, '...'))

                # Save to the database
                if tools.is_have_chinese(content):
                    is_continue = self_base_parser.add_news_acticle(
                        uuid, title, author, release_time, website_name,
                        website_domain, website_position, url, content)

                    if not is_continue:
                        break
        else:
            # The for loop finished without a break: every result on this page was saved, so fetch the next page
            start += 50
            continue
        break  # add_news_acticle signalled to stop, so stop paging as well

    base_parser.update_url('google_news_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    title = '<tr height="25"><td><a href=".*?"  title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?)&nbsp']
    file_size = ['资料大小: </span>(.*?)&nbsp']
    download_count = ['下载次数: </span>(.*?)&nbsp']

    titles = tools.get_info(html, title, allow_repeat = True)
    video_urls = tools.get_info(html, video_url, allow_repeat = True)
    authors = tools.get_info(html, author, allow_repeat = True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat = True)
    file_sizes = tools.get_info(html, file_size, allow_repeat= True)
    download_counts = tools.get_info(html, download_count, allow_repeat = True)


    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:    %s
            视频地址: %s
            作者:    %s
            观看数    %s
            资料大小  %s
            下载次数  %s
        '''%(title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(title, '',
                                                            remark['search_keyword1'],
                                                            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title, file_size = file_size,
                                     file_name = title, author = author, watched_count = watched_count,
                                     download_count = download_count, search_type = search_type,
                                     keyword = contained_key, keyword_count = contained_key_count, task_id = remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html, STOP_URLS)

    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #12
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    if depth == 0:
        parser_video_info(root_url, depth, site_id, remark)
def main():
    '''
    @summary:
    ---------
    @param :
    ---------
    @result:
    '''

    clues_json = get_clues()
    clues_json = tools.dumps_json(clues_json)
Example #14
    def add_department(self, name):
        url = 'https://qyapi.weixin.qq.com/cgi-bin/department/create?access_token=%s' % self._sync_user_access_token
        data = {
            "name": name,
            "parentid": 1,
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(
            url, headers=HEADER,
            data=data)  # {'errcode': 0, 'id': 4, 'errmsg': 'created'}
        return result.get("id")
Example #15
    def get_article(self):
        '''
        @summary: Currently queries on record_time to make sure there is data; it should really use release_time TODO
        ---------
        ---------
        @result:
        '''

        per_record_time = self.get_per_record_time()

        if per_record_time:
            body = {
                "size": 200,
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "record_time": {
                                    "gt": per_record_time
                                }
                            }
                        }
                    }
                },
                "sort": [{
                    "record_time": "asc"
                }]
            }

        else:
            body = {
                # "query": {
                #     "filtered": {
                #       "filter": {
                #         "range": {
                #            "release_time" : {
                #                 "gte": today_time + ' 00:00:00', # 今日
                #                 "lte": today_time + ' 23:59:59' # 今日
                #             }
                #         }
                #       }
                #     }
                # },
                "size": 200,
                "sort": [{
                    "record_time": "asc"
                }]
            }

        log.debug(self._table + " => " + tools.dumps_json(body))

        article = self._data_pool_es.search(self._table, body)
        return article.get('hits', {}).get('hits', [])
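
get_article returns the raw hits list from Elasticsearch, so each element still carries the ES envelope around the document. A caller would typically unwrap _source and can keep the last record_time for the next incremental query; a small hedged sketch (iter_articles and service are hypothetical names):

def iter_articles(service):
    # service is assumed to expose get_article() as defined above.
    for hit in service.get_article():
        article = hit.get('_source', {})          # the stored document itself
        yield article.get('record_time'), article
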
def parser_episode_info(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    episode_info = episode_data.get('info', {})

    name = episode_info.get('title', '')
    url = episode_info.get('url', '')
    image_url = episode_info.get('thumb', '')
    episode_num = episode_info.get('series', '')
    summary = episode_info.get('desc', '')
    time_length = episode_info.get('duration', '')

    episode_download_url = episode_data.get('stream', [{'url':''}])[0].get('url')
    episode_download_url = 'http://disp.titan.mgtv.com' + episode_download_url

    episode_download_info = tools.get_json_by_requests(episode_download_url)
    if episode_download_info:
        episode_download_url = episode_download_info.get('info', '')
    else:
        episode_download_url = ''

    log.debug('''
        program_mongo_id     %s
        name                 %s
        url                  %s
        image_url            %s
        episode_num          %s
        summary              %s
        time_length          %s
        episode_download_url %s
        '''%(program_mongo_id, name, url, image_url, episode_num, summary, time_length, episode_download_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_mongo_id, episode_num = episode_num, time_length = time_length, episode_name = name, download_status = '', download_url = episode_download_url, episode_url = url, summary = summary, image_url = image_url, sto_path = '')
    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example #17
    def deal_request(self, name):
        web.header('Content-Type','text/html;charset=UTF-8')

        data = json.loads(json.dumps(web.input()))
        client_ip = web.ctx.ip

        if name == 'get_task':
            tasks = self.task_service.get_task()
            tasks = tools.dumps_json(tasks)
            log.info('''
                客户端 ip: %s
                取任务   : %s'''%(client_ip, tasks))

            return tasks

        elif name == 'update_task':
            tasks = eval(data.get('tasks', []))
            status = data.get('status')
            self.task_service.update_task_status(tasks, status)

            return tools.dumps_json('{"status":1}')
Example #18
def parser_url_info(url_info):
    log.info('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']
    spider_depth = remark['spider_depth']

    return root_url, depth, remark, website_name, website_position, website_url, website_domain, spider_depth
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    if depth == 0:
        parser_content_info(url_info)
    elif depth == 1:
        pass
Example #20
    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''

        log.debug('''
            -----文章信息-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))
Example #21
def save_article_list(datas: list):
    log.debug(tools.dumps_json(datas))

    sql, articles = tools.make_batch_sql('wechat_article_list', datas)
    db.add_batch(sql, articles)

    # Save article tasks
    article_task = [{
        "sn": article.get('sn'),
        "article_url": article.get('url'),
        "__biz": article.get('__biz')
    } for article in datas]

    sql, article_task = tools.make_batch_sql('wechat_article_task',
                                             article_task)
    db.add_batch(sql, article_task)
Example #22
def save_baidu_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    is_debug=False):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': release_time,
    }
    log.debug(tools.dumps_json(content_info))
Example #23
def main():
    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword1),to_char(t.keyword2),to_char(t.keyword3),t.zero_id  from TAB_IOPM_CLUES t'
    results = db.find(sql)

    clues_json = {
        "message":
        "查询成功",
        "status":
        1,
        "data": [{
            "clues_id": 104,
            "keyword1": "",
            "keyword2": "",
            "keyword3": "",
            "zero_id": 2
        }]
    }

    clues_json['data'] = []

    for result in results:
        data = {
            "clues_id":
            result[0] if result[0] else "",
            "keyword1":
            "%s" %
            (result[1].replace('"', '“').replace('、', '')[:-1] if result[1][-1]
             == ',' else result[1].replace('"', '')) if result[1] else "",
            "keyword2":
            "%s" %
            (result[2].replace('"', '“').replace('、', '')[:-1] if result[2][-1]
             == ',' else result[2].replace('"', '')) if result[2] else "",
            "keyword3":
            "%s" %
            (result[3].replace('"', '“').replace('、', '')[:-1] if result[3][-1]
             == ',' else result[3].replace('"', '')) if result[3] else "",
            "zero_id":
            result[4] if result[4] else ""
        }
        clues_json["data"].append(data)

    clues_json = tools.dumps_json(clues_json)
    print(clues_json)

    tools.write_file('./clues.txt', clues_json)
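
The inline conditional expressions that build keyword1/keyword2/keyword3 are hard to follow. Each one applies the same rule, written out here as a hypothetical helper with matching behaviour (note that the two branches treat double quotes differently, exactly as the expressions above do):

def clean_keyword(keyword):
    # Hypothetical helper equivalent to the inline expressions above.
    if not keyword:
        return ""
    if keyword[-1] == ',':
        # Replace " with “, drop 、, then strip the trailing comma.
        return keyword.replace('"', '“').replace('、', '')[:-1]
    # No trailing comma: just drop double quotes.
    return keyword.replace('"', '')
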
Example #24
    def deal_request(self):
        web.header('Content-Type', 'text/html;charset=UTF-8')
        print(str(web.input()))
        data = json.loads(json.dumps(web.input()))

        # Article info
        article_id = data.get('article_id')
        may_invalid = data.get('may_invalid') or 0

        # Hot-topic info
        hot_id = data.get('hot_id')
        hot_value = data.get('hot_value') or 0

        # Common parameters
        clues_ids = data.get('clues_ids') or ''
        article_count = data.get('article_count') or 0
        vip_count = data.get('vip_count') or 0
        negative_emotion_count = data.get('negative_emotion_count') or 0
        zero_ids = data.get('zero_ids') or ''

        status = 0  # 0 = failed, 1 = succeeded
        weight = -1

        try:
            if hot_id:
                status, weight = RelatedSortAction._related_sort_service.deal_hot(
                    hot_id, float(hot_value), clues_ids, zero_ids,
                    int(article_count), int(vip_count),
                    int(negative_emotion_count))

            elif article_id:
                status, weight = RelatedSortAction._related_sort_service.deal_article(
                    article_id, clues_ids, zero_ids, int(may_invalid),
                    int(vip_count), int(negative_emotion_count))

        except Exception as e:
            log.error(e)

        result = {
            "status": 1 if status else 0,
            "message": "处理成功" if status else "处理失败",
            "id": hot_id or article_id,
            "weight": weight
        }

        return tools.dumps_json(result)
Example #25
    def __invite_user(self, user_id):
        '''
        @summary: Invite a member
        ---------
        @param user_id:
        ---------
        @result:
        '''

        url = 'https://qyapi.weixin.qq.com/cgi-bin/batch/invite?access_token=' + self._sync_user_access_token
        data = {
            "user": [user_id],
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url, headers=HEADER, data=data)
        return result
Example #26
def main():
    '''
    @summary:
    ---------
    @param :
    ---------
    @result:
    '''

    clues_json = get_clues()
    clues_count = len(clues_json['data'])

    clues_json = tools.dumps_json(clues_json)
    print(clues_json)
    # save_clues_to_file(clues_json)

    keys = 'pattek.com.cn'
    prpcrypt = Prpcrypt(keys)
    encrypt_text = prpcrypt.encrypt(clues_json)

    data = {'info': encrypt_text}

    # Sync to the intranet
    url = 'http://192.168.60.38:8002/datasync_al/interface/cluesConfSync?'
    json = tools.get_json_by_requests(url, data=data)
    # Record the sync action
    result = record_sync_status(clues_count, json.get("status"),
                                json.get('message'), json.get('data'), 0)
    print(result)
    log.debug('''
        ------ 同步线索到内网 -----
        %s
        记录到数据库 %d
        ''' % (json, result))

    # Sync to the external network
    url = 'http://124.205.229.232:8005/gdyq/datasync_al/interface/cluesConfSync'
    json = tools.get_json_by_requests(url, data=data)
    # Record the sync action
    result = record_sync_status(clues_count, json.get("status"),
                                json.get('message'), json.get('data'), 1)
    log.debug('''
        ------ 同步线索到外网 -----
        %s
        记录到数据库 %d
        ''' % (json, result))
def parser_episode_detail_url(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    # Parse the detail URLs of each episode
    episode_list = episode_data.get('list', [])
    for episode in episode_list:
        episode_id = episode['video_id']
        episode_detail_url = 'http://pcweb.api.mgtv.com/player/video?video_id=' + episode_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 2, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    # Parse the URLs for other years and months
    episode_years = episode_data.get('tab_y', [])
    episode_months = episode_data.get('tab_m', [])
    for episode_year in episode_years:
        year = episode_year['t']
        temp_program_id = episode_year['id']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s'%temp_program_id
        # Add the URL; without a month parameter it defaults to the most recent month's data
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : temp_program_id})

    for episode_month in episode_months[1:]:  # skip the most recent month's data
        episode_month = episode_month['m']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s'%(program_id, episode_month)
        # Add the URL
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example #28
    def update_user(self,
                    user_id,
                    user_name='',
                    mobile='',
                    email='',
                    enable=1):
        url = 'https://qyapi.weixin.qq.com/cgi-bin/user/update?access_token=' + self._sync_user_access_token
        data = {
            "userid": user_id,
            "name": user_name,
            "mobile": mobile,
            "email": email,
            "enable": 1,
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url, headers=HEADER, data=data)
        return result
Example #29
    def add_user(self, user_name, mobile, email='', user_id='', enable=1):
        '''
        @summary: Add a user
        The secret in the access_token must be the contact-sync secret from the management tool
        ---------
        @param user_name:
        @param mobile:
        @param email:
        @param user_id:
        @param enable: enable the member, 0 = disabled, 1 = enabled
        ---------
        @result:
        '''
        user_id = user_id if user_id else tools.get_uuid()

        # Format of the returned data
        return_json = {"errcode": 0, "errmsg": "created", 'user_id': user_id}

        url = 'https://qyapi.weixin.qq.com/cgi-bin/user/create?access_token=' + self._sync_user_access_token
        data = {
            "userid": user_id,
            "name": user_name,
            "mobile": mobile,
            "department": [WechatService._depertment_id],
            "email": email,
            'enable': enable,
            'to_invite':
            False  # whether to invite the member to use WeChat Work (the invitation is pushed via WeChat service notification, SMS or email, once per day for at most 3 working days); the default is true
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url, headers=HEADER, data=data)

        if result.get('errcode') == 0:
            result = self.__invite_user(user_id)

        if result.get('errcode'):
            return_json['errcode'] = result.get('errcode')
            return_json['errmsg'] = result.get('errmsg')

        return return_json
Example #30
def save_video_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    image_url='',
                    site_name='',
                    play_count=None,
                    comment_count=None,
                    praise_count=None,
                    summary='',
                    time_length=None):
    domain = tools.get_domain(url)
    uuid = tools.get_uuid(title, domain)

    if es.get('video_news', uuid):
        log.debug(title + ' 已存在')
        return False

    content_info = {
        'domain': domain,
        'uuid': uuid,
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }
    log.debug(tools.dumps_json(content_info))

    es.add('video_news', content_info, content_info['uuid'])
    return True
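
save_video_info deduplicates on a uuid derived from the title and the URL's domain before writing to the video_news index, and returns False when the document already exists. A hedged example of how a parser might call it (every field value below is made up):

saved = save_video_info(
    release_time='2018-05-20 12:00:00',
    content='sample body text',
    url='http://www.example.com/video/1.html',
    author='sample author',
    title='sample video title',
    image_url='http://www.example.com/cover.jpg',
    site_name='example',
    play_count=1024,
    summary='short summary',
    time_length=300)
if not saved:
    pass  # already indexed under the same uuid, nothing to do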