def parser_episode_info(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    episode_info = episode_data.get('info', {})

    name = episode_info.get('title', '')
    url = episode_info.get('url', '')
    image_url = episode_info.get('thumb', '')
    episode_num = episode_info.get('series', '')
    summary = episode_info.get('desc', '')
    time_length = episode_info.get('duration', '')

    episode_stream = episode_data.get('stream') or [{'url': ''}]
    episode_download_url = episode_stream[0].get('url', '')
    episode_download_url = 'http://disp.titan.mgtv.com' + episode_download_url

    episode_download_info = tools.get_json_by_requests(episode_download_url)
    if episode_download_info:
        episode_download_url = episode_download_info.get('info', '')
    else:
        episode_download_url = ''

    log.debug('''
        program_mongo_id     %s
        name                 %s
        url                  %s
        image_url            %s
        episode_num          %s
        summary              %s
        time_length          %s
        episode_download_url %s
        '''%(program_mongo_id, name, url, image_url, episode_num, summary, time_length, episode_download_url))

    base_parser.add_program_episode_info(
        'PROGRAM_EPISODE_info', site_id, program_mongo_id,
        episode_num=episode_num, time_length=time_length, episode_name=name,
        download_status='', download_url=episode_download_url,
        episode_url=url, summary=summary, image_url=image_url, sto_path='')
    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
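
Note: every example in this collection goes through tools.get_json_by_requests, whose source is not shown here. A minimal sketch of what such a helper presumably wraps, assuming it is a thin layer over requests that returns None on any failure so callers can test `if not json:` (a hypothetical reimplementation, not the actual tools module):

import requests

def get_json_by_requests(url, params=None, data=None, headers=None, timeout=30):
    # Sketch only: GET when no body is given, POST otherwise; return the
    # decoded JSON, or None on any network or decode error.
    try:
        if data is None:
            response = requests.get(url, params=params, headers=headers,
                                    timeout=timeout)
        else:
            response = requests.post(url, params=params, data=data,
                                     headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        return None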
Example n. 2
    def update_hot_weight(self, articles):
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'hot_id': article_info['ID'],  # article id
                'hot_value': article_info['HOT'],  # hot value
                'clues_ids': article_info['CLUES_IDS'],  # clue ids matched from related public opinion
                'article_count': article_info['ARTICLE_COUNT'],  # total article count
                'vip_count': article_info["VIP_COUNT"],  # mainstream media count
                'negative_emotion_count': article_info["NEGATIVE_EMOTION_COUNT"],  # negative sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print('''
                release_time %s
                record_time  %s
                ''' %
                  (article_info["RELEASE_TIME"], article_info["RECORD_TIME"]))

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor does not account for region
            tools.print_one_line("updating relevance %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                record_time = article_info['RECORD_TIME']

        return record_time
Example n. 3
    def update_article_weight(self, articles):
        release_time = ''
        record_time = ''
        for article in articles:
            article_info = article.get('_source')
            if article_info['WEIGHT'] == 0:
                continue

            data = {
                'article_id': article_info['ID'],  # article id
                'clues_ids': article_info['CLUES_IDS'],  # clue ids
                'may_invalid': 0,  # possibly invalid (Weibo containing @ or #)
                'vip_count': article_info['IS_VIP'],  # mainstream media count
                'negative_emotion_count': article_info['EMOTION'],  # negative sentiment count
                'zero_ids': article_info['ZERO_ID']
            }
            print(article_info["TITLE"])
            print(article_info["RELEASE_TIME"])

            result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS,
                                                data=data)
            weight = result.get('weight', 0)  # * weight_factor does not account for region
            tools.print_one_line("updating relevance %s -> %s" %
                                 (article_info['WEIGHT'], weight))

            if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                          article_info['ID'],
                                          {"WEIGHT": weight}):
                release_time, record_time = article_info[
                    "RELEASE_TIME"], article_info["RECORD_TIME"]

        return release_time, record_time
Example n. 4
def add_root_url(keywords):
    log.debug('''
        adding root url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            video_info_list = (info_json or {}).get('list')
            if not video_info_list:
                print(url)
                break
            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                url = video_info['url']
                release_time = video_info['showtime']

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example n. 5
    def send_file(self, users, media_id):
        '''
        @summary:
        ---------
        @param users:
        @param media_id: file id; can be obtained via the temporary media upload API
        ---------
        @result:
        '''
        url = 'https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=%s' % self._send_msg_access_token
        data = {
            "touser": users,
            "toparty": "",
            "totag": "",
            "msgtype": "file",
            "agentid": self._agentid,
            "file": {
                "media_id": media_id
            },
            "safe": 0
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url=url, headers=HEADER, data=data)
        return result
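
The docstring above notes that media_id comes from the temporary-media upload API. A hedged sketch of that upload step, assuming WeChat Work's standard media/upload endpoint (the class's own upload method is not shown in this example):

import requests

def upload_temp_media(access_token, file_path, media_type='file'):
    # WeChat Work temporary-media upload; the returned media_id can then be
    # passed to send_file. Error handling is omitted in this sketch.
    url = ('https://qyapi.weixin.qq.com/cgi-bin/media/upload'
           '?access_token=%s&type=%s' % (access_token, media_type))
    with open(file_path, 'rb') as f:
        result = requests.post(url, files={'media': f}).json()
    return result.get('media_id')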
def main():
    url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotAnalysis_self'
    json = tools.get_json_by_requests(url)
    # print(json)

    hot_list = []
    datas = json['data']
    for data in datas:
        clus_id = list(data.keys())[0]
        sql = 'select t.name from TAB_IOPM_CLUES t where id = ' + clus_id
        name = oracledb.find(sql)[0][0]

        hot_infos = data[clus_id]['data']
        for hot_info in hot_infos:
            kw = hot_info['kw']
            hot = hot_info['hot']
            # print(name, kw, hot)
            hot_list.append({
                'name': name,
                'kw': kw,
                'hot': hot,
                'clus_id': clus_id
            })

    hot_list.sort(key=lambda obj: obj.get('hot'), reverse=True)

    for hot_info in hot_list:
        print(hot_info['name'], hot_info['clus_id'], '--->', hot_info['kw'],
              hot_info['hot'])
Example n. 7
    def get_article(self):
        '''
        @summary: currently keyed on record_time to guarantee data; should normally use release_time TODO
        ---------
        ---------
        @result:
        '''

        per_record_time = self.get_per_record_time()

        today_time = tools.get_current_date('%Y-%m-%d')
        if per_record_time:
            sql = "select * from {table} where record_time > '{record_time}' and release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
                table=self._table,
                record_time=per_record_time,
                today_time=today_time)
        else:
            sql = "select * from {table} where release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
                table=self._table, today_time=today_time)

        url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS,
                                                       sql=sql)
        log.debug(url)

        article = tools.get_json_by_requests(url)
        return article.get('hits', {}).get('hits', [])
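
A caveat on the URL built above: the SQL is interpolated into the query string unescaped, so spaces and quotes depend on the HTTP client to encode them. A safer sketch, assuming the Elasticsearch SQL plugin accepts a percent-encoded query (hypothetical helper, not from the source):

from urllib.parse import quote

def build_es_sql_url(address, sql):
    # Percent-encode the SQL so spaces and quotes survive the query string.
    return 'http://{address}/_sql?sql={sql}'.format(address=address,
                                                    sql=quote(sql))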
Example n. 8
    def get_download_url(url):
        html, r = tools.get_html_by_requests(url)

        tvid = re.compile(r'player-tvid="(\d{4,11})"').findall(str(html))
        if not tvid:
            tvid = re.compile(r'list-tvid="(\d{4,11})"').findall(str(html))
        tvid = tvid[-1] if tvid else ''  # keep the last match

        album_id = ''.join(re.compile(r'player-albumid="(\d{4,11})"').findall(str(html)))
        if not album_id:
            album_id = ''.join(re.compile(r'list-albumid="(\d{4,11})"').findall(str(html)))
            if not album_id:
                album_id = ''.join(re.compile(r'albumId: ?(\d{4,11}),').findall(str(html)))
                if not album_id:
                    album_id = ''.join(re.compile(r'param\[\'albumId\'\] ?= ?"(\d{4,11})"').findall(str(html)))

        current_time = tools.get_current_timestamp() * 1000
        current_time = str(current_time)

        url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
        json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)

        try:
            video_download_url = ''.join(re.compile("'1': {(.+?)},").findall(str(json_)))
            video_download_url = ''.join(re.compile("'url': ?'(.+?)'").findall(str(video_download_url)))
            video_download_url, r = tools.get_html_by_requests(video_download_url)
            video_download_url = ''.join(re.compile('"l":"(.+?)"').findall(str(video_download_url)))
        except Exception:
            video_download_url = ''
        return video_download_url
        def export_callback(execute_type, sql, data_json):
            if execute_type != ExportData.EXCEPTION:

                # fetch the public opinion that concerns us
                hot_vip_article_count, negative_emotion_count, article_count, article_clues_ids = get_about_me_message(
                    hot_info['kg'], hot_id)
                print('====================')
                # compute weight
                url = IOPM_SERVICE_ADDRESS + '/related_sort?hot_id=%d&hot_value=%s&clues_ids=%s&article_count=%s&vip_count=%s&negative_emotion_count=%s' % (
                    hot_id, hot_info['hot'], article_clues_ids, article_count,
                    hot_vip_article_count, negative_emotion_count)
                weight = tools.get_json_by_requests(url).get('weight', 0)
                print(url)
                print('----------------------------')

                # sync to es
                data_json['WEIGHT'] = weight
                data_json['IS_VIP'] = hot_vip_article_count
                data_json['NEGATIVE_EMOTION_COUNT'] = negative_emotion_count
                data_json['ARTICLE_COUNT'] = article_count
                data_json['ARTICLE_CLUES_IDS'] = article_clues_ids
                es.add(table='TAB_IOPM_HOT_INFO',
                       data=data_json,
                       data_id=data_json.get("ID"))

                # update the data in the oracle database
                sql = "update tab_iopm_hot_info set is_vip = %s, weight= %s, negative_emotion_count = %s, article_count = %s, article_clues_ids = '%s' where id = %s" % (
                    hot_vip_article_count, weight, negative_emotion_count,
                    article_count, article_clues_ids, data_json["ID"])
                oracledb.update(sql)
Example n. 10
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    html_json = tools.get_json_by_requests(root_url, headers=headers)
    data_info = jsonpath.jsonpath(html_json, '$..video_info')
    for data in data_info:
        title = data.get('title')
        video_url = data.get('play_url')
        img_url = data.get('cover_url')
        release_time = stamp_to_date(data.get('upline_time'))

        if video_url != '':
            info_type = 1
        else:
            info_type = 2

        base_parser.save_info('content_info', site_id=SITE_ID, url=video_url,
                              title=title, site_name=NAME, content='',
                              release_time=release_time, image_url=img_url,
                              video_url=video_url, is_out_link=1,
                              download_image=False, is_debug=False,
                              info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
Example n. 11
def parser_comment(content_id, wall_id, page=1):
    log.debug('Crawling page %s of article comments, content_id = %s' % (page, content_id))
    flow_comment_url = 'http://sns-comment.iqiyi.com/v2/comment/get_comments.action?contentid={content_id}&page={page}&authcookie=null&page_size=40&wallId={wall_id}&agenttype=117&t={timestamp_m}'.format(
        content_id=content_id,
        page=page,
        wall_id=wall_id,
        timestamp_m=int(tools.get_current_timestamp() * 1000))

    comment_json = tools.get_json_by_requests(flow_comment_url)
    data = comment_json.get('data', {})

    # can be used as the basis for pagination
    total_count = data.get('totalCount', 0)
    count = data.get('count', 0)

    replies = data.get('replies', [])
    for reply in replies:
        reply_source = reply.get("replySource", {})
        if not deal_comment(reply_source):
            break

        if not deal_comment(reply):
            break

    else:
        if replies:
            parser_comment(content_id, wall_id, page + 1)
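
The for/else above drives the pagination: the else clause runs only when the loop finishes without hitting break, so the next page is fetched only if every reply on this page was accepted. A standalone illustration of that control flow, with hypothetical helper names:

def fetch_all_pages(fetch_page, handle_item, page=1):
    # Recurse into the next page only when no item on this page was rejected.
    items = fetch_page(page)
    for item in items:
        if not handle_item(item):
            break
    else:
        if items:  # an empty page ends the recursion
            fetch_all_pages(fetch_page, handle_item, page + 1)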
Example n. 12
        def end_callback():
            log.info('\n********** news end **********')
            task_status.is_doing = False

            data = {'tasks': str(tasks), 'status': 603}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: done!')
Example n. 13
def main():
    '''
    @summary:
    ---------
    @param :
    ---------
    @result:
    '''

    clues_json = get_clues()
    clues_count = len(clues_json['data'])

    clues_json = tools.dumps_json(clues_json)
    print(clues_json)
    # save_clues_to_file(clues_json)

    keys = 'pattek.com.cn'
    prpcrypt = Prpcrypt(keys)
    encrypt_text = prpcrypt.encrypt(clues_json)

    data = {'info': encrypt_text}

    # sync to the intranet
    url = 'http://192.168.60.38:8002/datasync_al/interface/cluesConfSync?'
    json = tools.get_json_by_requests(url, data=data)
    # record the sync
    result = record_sync_status(clues_count, json.get("status"),
                                json.get('message'), json.get('data'), 0)
    print(result)
    log.debug('''
        ------ sync clues to intranet -----
        %s
        recorded to database %d
        ''' % (json, result))

    # sync to the extranet
    url = 'http://124.205.229.232:8005/gdyq/datasync_al/interface/cluesConfSync'
    json = tools.get_json_by_requests(url, data=data)
    # record the sync
    result = record_sync_status(clues_count, json.get("status"),
                                json.get('message'), json.get('data'), 1)
    log.debug('''
        ------ sync clues to extranet -----
        %s
        recorded to database %d
        ''' % (json, result))
Example n. 14
        def begin_callback():
            log.info('\n********** news begin **********')
            # update task status to doing

            data = {'tasks': str(tasks), 'status': 602}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: in progress...')
Example n. 15
def get_proxies():
    api_url = "http://api.xdaili.cn/xdaili-api//privateProxy/applyStaticProxy?spiderId=fadc76e39a074860aaf837b455001f75&returnType=2&count=10"
    api_json = tools.get_json_by_requests(api_url)
    ips = jsonpath(api_json, "$..ip")
    ports = jsonpath(api_json, "$..port")
    if ips and ports:
        for ip, port in zip(ips, ports):
            proxy = ip + ":" + port
            yield ip, proxy
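
Since this version of get_proxies is a generator, a caller might consume it lazily, one proxy per request attempt. A usage sketch (the target URL and the shape of the proxies dict are assumptions, not from the source):

import requests

for ip, proxy in get_proxies():
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        response = requests.get('http://httpbin.org/ip', proxies=proxies,
                                timeout=10)
        print(ip, response.status_code)
    except requests.RequestException:
        continue  # bad proxy, try the next one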
        def export_callback(execute_type, sql, data_json):
            if execute_type != ExportData.EXCEPTION:
                infoIds = data['infoIds']
                url = root_url % infoIds
                json = tools.get_json_by_requests(url, headers=HEADERS)
                articles = json['data']

                # "EMOTION": 'vint_3',
                # "ACCOUNT": null,
                # "WEIGHT": 0,
                # "TITLE": "str_title",
                # "URL": "str_url",
                # "MAY_INVALID": ,
                # "CLUES_IDS": "",
                # "WEBSITE_NAME": "str_site",
                # "KEYWORDS_COUNT": 1,
                # "HOST": "str_site",
                # "INFO_TYPE": 'int_type',
                # "COMMENT_COUNT": null,
                # "HOT_ID": "vint_%d"%hot_id,
                # "REVIEW_COUNT": null,
                # "UUID": "73ec16038e074530ff109e3cfad2594c",
                # "ID": 'vint_%d'%article_id,
                # "IS_VIP": null,
                # "IMAGE_URL": 'str_picture',
                # "KEYWORDS": "str_keywords",
                # "KEYWORD_CLUES_ID": "{"中央电视台":"88758"}",
                # "RELEASE_TIME": "date_pubtime",
                # "AUTHOR": "江门日报",
                # "CONTENT": "clob_content",
                # "RECORD_TIME": 'vdate_%s'%tools.get_current_date(),
                # "UP_COUNT": 'vint_null'

                key_map = {
                    'id': 'int_dataId',
                    'content': 'clob_content',
                    'url': 'str_url',
                    'website_name': 'str_site',
                    'image_url': 'str_picture',
                    'release_time': 'date_pubtime',
                    'keywords': 'str_keywords',
                    'emotion': 'str_emotion',
                    'host': 'str_site',
                    'title': 'str_title',
                    'info_type': 'int_type',
                    'hot_id': "vint_%d" % hot_id,
                    'record_time': 'vdate_%s' % tools.get_current_date()
                }

                export_data.export_to_oracle(
                    key_map=key_map,
                    aim_table='TAB_IOPM_ARTICLE_INFO',
                    unique_key='url',
                    datas=articles,
                    unique_key_mapping_source_key={'url': 'str_url'},
                    sync_to_es=True)
Example n. 17
    def get_tags(self):
        '''
        @summary:
        ---------
        ---------
        @result:
        '''
        url = 'https://qyapi.weixin.qq.com/cgi-bin/tag/list?access_token=' + self._send_msg_access_token
        result = tools.get_json_by_requests(url)
        print(result)
Example n. 18
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1&notice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(
        wall_id=wall_id, feed_id=feed_id, sns_time=sns_time,
        timestamp_m=int(tools.get_current_timestamp() * 1000))
    print(article_json_url)
    article_json = tools.get_json_by_requests(article_json_url)

    wall_id = article_json.get('data', {}).get('wallId')
    # comment array
    feeds = article_json.get('data', {}).get('feeds', [])
    for feed in feeds:
        article_id = feed.get('commentId')
        head_url = feed.get('icon')
        name = feed.get('name')
        release_time = feed.get('releaseDate')
        release_time = tools.timestamp_to_date(release_time)
        title = feed.get('feedTitle')
        content = feed.get('description')
        image_urls = ','.join([img.get('url') for img in feed.get('pictures', [])])  # comma-separated
        watch_count = feed.get('uvCount')
        up_count = feed.get('agreeCount')
        comment_count = feed.get('commentCount')

        log.debug('''
            id:           %s
            program id:   %s
            avatar url:   %s
            name:         %s
            release time: %s
            title:        %s
            content:      %s
            image urls:   %s
            views:        %s
            likes:        %s
            comments:     %s
            ''' % (article_id, video_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(
                article_id, head_url, name, release_time, title, content,
                image_urls, watch_count, up_count, comment_count,
                program_id=video_id, gender=random.randint(0, 1), url=url,
                info_type=3, emotion=random.randint(0, 2), collect=0,
                source='爱奇艺'):
            # parse comments
            parser_comment(article_id, wall_id)
        else:
            break
    else:
        if feeds:
            feed_id = feeds[-1].get('feedId')
            sns_time = feeds[-1].get('snsTime')
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
def inner_add_url(url, keyword):
    while url:
        html_json = tools.get_json_by_requests(url)
        json_value = tools.get_json_value(html_json, 'obj.pageNumberStack')
        has_next = tools.get_json_value(html_json, 'obj.hasNext')
        if has_next:
            url = 'http://sj.qq.com/myapp/searchAjax.htm?kw=%s&pns=' % keyword + json_value + '&sid=0'
            base_parser.add_url('WWA_search_app_urls', SITE_ID, url)
            continue
        else:
            break
def get_article_count_msg(begin_time, end_time):

    # count of articles crawled
    data_pool_address = 'http://192.168.60.16:9200/_sql?sql='
    sql = "SELECT count(*) FROM news_article where record_time >= '{begin_time}' and record_time <= '{end_time}'".format(
        begin_time=begin_time, end_time=end_time)
    data = tools.get_json_by_requests(data_pool_address + sql)
    total_article_count = data.get('aggregations').get('COUNT(*)').get('value')

    # count of newly published articles crawled
    data_pool_address = 'http://192.168.60.16:9200/_sql?sql='
    sql = "SELECT count(*) FROM news_article where release_time >= '{begin_time}' and release_time <= '{end_time}'".format(
        begin_time=begin_time, end_time=end_time)
    data = tools.get_json_by_requests(data_pool_address + sql)
    new_article_count = data.get('aggregations').get('COUNT(*)').get('value')

    # count of articles that entered the business database
    iopm_db_address = 'http://192.168.60.27:9200/_sql?sql='
    sql = "SELECT count(*) FROM tab_iopm_article_info where  INFO_TYPE = 1 and RECORD_TIME >= '{begin_time}' and RECORD_TIME <= '{end_time}'".format(
        begin_time=begin_time, end_time=end_time)
    data = tools.get_json_by_requests(iopm_db_address + sql)
    iopm_total_article_count = data.get('aggregations').get('COUNT(*)').get(
        'value')

    # count of newly published articles that entered the business database
    iopm_db_address = 'http://192.168.60.27:9200/_sql?sql='
    sql = "SELECT count(*) FROM tab_iopm_article_info where INFO_TYPE = 1 and RELEASE_TIME >= '{begin_time}' and RELEASE_TIME <= '{end_time}'".format(
        begin_time=begin_time, end_time=end_time)
    data = tools.get_json_by_requests(iopm_db_address + sql)
    iopm_new_article_count = data.get('aggregations').get('COUNT(*)').get(
        'value')

    article_count_msg = '''
        \rtotal valid articles crawled: %s
        \rnewly published articles crawled: %s
        \rtotal articles in the business db after dedup: %s
        \rnewly published articles in the business db after dedup: %s
    ''' % (total_article_count, new_article_count, iopm_total_article_count,
           iopm_new_article_count)

    return article_count_msg
Example n. 21
def get_proxies():
    api_url = "http://api.xdaili.cn/xdaili-api//privateProxy/applyStaticProxy?spiderId=afadc76e39a074860aaf837b455001f75&returnType=2&count=10"
    api_json = tools.get_json_by_requests(api_url)
    ips = jsonpath(api_json, "$..ip")
    ports = jsonpath(api_json, "$..port")
    if ips and ports:
        ips_list = []
        for ip, port in zip(ips, ports):
            proxy = ip + ":" + port
            ip_info = {'ip': ip, 'proxy': proxy}
            ips_list.append(ip_info)
        return ips_list
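
Note the contrast with the generator variant of get_proxies earlier in this collection: this one materializes the whole list up front, which suits callers that need len() or random access, while the generator suits one-at-a-time retry loops.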
Example n. 22
def getdownload(episode_download_url_json):
    episode_json = tools.get_json_by_requests(episode_download_url_json)
    # print(episode_download_url_json)
    episode_download_url = tools.get_json_value(episode_json,
                                                'msgs.playurl.domain')
    episode_download_url = episode_download_url and episode_download_url[0] or ''
    # print('-----', episode_download_url)
    episode_download_url_definition = tools.get_json_value(
        episode_json, 'msgs.playurl.dispatch.1080p')
    episode_download_url_definition = (episode_download_url_definition
                                       and episode_download_url_definition[0] or '')
    print(episode_download_url_definition, '*********')
    episode_download_url = episode_download_url + episode_download_url_definition
    episode_download_url += "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
        random.random(), '1080p')
    episode_download_url_json = tools.get_json_by_requests(
        episode_download_url)
    episode_download_url = tools.get_json_value(episode_download_url_json,
                                                'location')
    return episode_download_url
Example n. 23
    def add_department(self, name):
        url = 'https://qyapi.weixin.qq.com/cgi-bin/department/create?access_token=%s' % self._sync_user_access_token
        data = {
            "name": name,
            "parentid": 1,
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(
            url, headers=HEADER,
            data=data)  # {'errcode': 0, 'id': 4, 'errmsg': 'created'}
        return result.get("id")
Example n. 24
    def get_news_article(self):
        news_record_time = self._get_per_record_time()
        if news_record_time:
            sql = 'select * from news_article where record_time > {record_time} order by record_time'.format(record_time=news_record_time)
        else:
            sql = 'select * from news_article order by record_time limit 1'

        url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS, sql=sql)
        print(url)

        news = tools.get_json_by_requests(url)
        return news.get('hits', {}).get('hits', [])
Example n. 25
def main():
    while True:
        if task_status.is_doing:
            log.debug('Task in progress; not fetching a new one')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # fetch a task
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'
        data = tools.get_json_by_requests(get_task_url)
        # tasks = [[209690, '百度新闻', 11, 'http://news.baidu.com/?tn=news',  3]]
        print(data)
        tasks = data.get('tasks', [])
        parser_count = data.get('thread_count')

        def begin_callback():
            log.info('\n********** news begin **********')
            # update task status to doing

            data = {'tasks': str(tasks), 'status': 602}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: in progress...')

        def end_callback():
            log.info('\n********** news end **********')
            task_status.is_doing = False

            data = {'tasks': str(tasks), 'status': 603}

            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: done!')

        # configure the spider
        spider = Spider(tab_urls='news:news_urls',
                        parser_count=parser_count,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=tasks,
                        delete_tab_urls=False)

        # add the parser
        spider.add_parser(news_parser)

        spider.start()
    def get_biz(self, account_id='', account=''):
        '''
        @summary: get the official account's __biz parameter
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''

        keyword = account_id or account  # account id takes precedence
        log.debug('search keywords ' + keyword)
        __biz = ''

        url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz'
        params = {
            "count": "5",
            "begin": "0",
            "action": "search_biz",
            "lang": "zh_CN",
            "random": str(random.random()) + str(random.randint(1, 9)),
            "ajax": "1",
            "token": TOOKEN,
            "f": "json",
            "query": keyword
        }

        account_json = tools.get_json_by_requests(url,
                                                  params=params,
                                                  headers=HEADERS)

        # TOOKEN expired; returns {'base_resp': {'ret': 200003, 'err_msg': 'invalid session'}}
        account_list = account_json.get("list", [])
        for account_info in account_list:
            nickname = (account_info.get('nickname') or '').lower()
            alias = (account_info.get('alias') or '').lower()
            if nickname == keyword.lower() or alias == keyword.lower():
                __biz = account_info.get('fakeid', '')
                break

        log.debug('''
            account name        %s
            account id          %s
            __biz               %s
            ''' % (account, account_id, __biz))

        return __biz
def parser_episode_detail_url(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    # parse the episode detail urls
    episode_list = episode_data.get('list', [])
    for episode in episode_list:
        episode_id = episode['video_id']
        episode_detail_url = 'http://pcweb.api.mgtv.com/player/video?video_id=%s' % episode_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=2,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    # parse urls for other years and months
    episode_years = episode_data.get('tab_y', [])
    episode_months = episode_data.get('tab_m', [])
    for episode_year in episode_years:
        year = episode_year['t']
        temp_program_id = episode_year['id']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s'%temp_program_id
        # add url; without a month param it defaults to the most recent month's data
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': temp_program_id})

    for episode_month in episode_months[1:]:  # skip the most recent month's data
        episode_month = episode_month['m']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s'%(program_id, episode_month)
        # add url
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example n. 28
    def __invite_user(self, user_id):
        '''
        @summary: invite members
        ---------
        @param user_id:
        ---------
        @result:
        '''

        url = 'https://qyapi.weixin.qq.com/cgi-bin/batch/invite?access_token=' + self._sync_user_access_token
        data = {
            "user": [user_id],
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url, headers=HEADER, data=data)
        return result
    def is_have_new_article(self, __biz):
        '''
        @summary: check whether the official account has published today
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''

        log.debug('search keywords ' + __biz)

        url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
        params = {
            "lang": "zh_CN",
            "token": TOOKEN,
            "query": "",
            "f": "json",
            "count": "5",
            "action": "list_ex",
            "ajax": "1",
            "type": "9",
            "fakeid": __biz,
            "random": str(random.random()) + str(random.randint(1, 9)),
            "begin": "0"
        }

        articles_json = tools.get_json_by_requests(url,
                                                   params=params,
                                                   headers=HEADERS)
        # print(articles_json)

        # TOOKEN expired; returns {'base_resp': {'err_msg': 'invalid csrf token', 'ret': 200040}}
        article_list = articles_json.get('app_msg_list', [])
        for article in article_list:
            release_time = article.get('update_time')
            release_time = tools.timestamp_to_date(release_time)
            log.debug("最近发文时间 %s" % release_time)

            if release_time >= tools.get_current_date('%Y-%m-%d'):
                return constance.UPDATE
            else:
                return constance.NOT_UPDATE
        else:
            return constance.ERROR
Example n. 30
    def update_user(self,
                    user_id,
                    user_name='',
                    mobile='',
                    email='',
                    enable=1):
        url = 'https://qyapi.weixin.qq.com/cgi-bin/user/update?access_token=' + self._sync_user_access_token
        data = {
            "userid": user_id,
            "name": user_name,
            "mobile": mobile,
            "email": email,
            "enable": 1,
        }

        data = tools.dumps_json(data).encode('utf-8')
        result = tools.get_json_by_requests(url, headers=HEADER, data=data)
        return result