def parser_episode_info(url_info):
    """Parse one mgtv episode-detail JSON and store the episode info.

    Marks the url DONE in PROGRAM_urls on every exit path (bad response,
    non-200 code, or success).
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    # FIX: was `code is not 200` — identity comparison with an int is
    # implementation-dependent (small-int caching); use value equality.
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    episode_info = episode_data.get('info', {})
    name = episode_info.get('title', '')
    url = episode_info.get('url', '')
    image_url = episode_info.get('thumb', '')
    episode_num = episode_info.get('series', '')
    summary = episode_info.get('desc', '')
    time_length = episode_info.get('duration', '')

    # The stream entry only carries a relative dispatch path; resolve it
    # against the dispatch host, then ask that endpoint for the real url.
    episode_download_url = episode_data.get('stream', [{'url': ''}])[0].get('url')
    episode_download_url = 'http://disp.titan.mgtv.com' + episode_download_url
    episode_download_info = tools.get_json_by_requests(episode_download_url)
    if episode_download_info:
        episode_download_url = episode_download_info.get('info', '')
    else:
        episode_download_url = ''

    log.debug('''
        program_mongo_id     %s
        name                 %s
        url                  %s
        image_url            %s
        episode_num          %s
        summary              %s
        time_length          %s
        episode_download_url %s
        ''' % (program_mongo_id, name, url, image_url, episode_num, summary,
               time_length, episode_download_url))

    base_parser.add_program_episode_info(
        'PROGRAM_EPISODE_info', site_id, program_mongo_id,
        episode_num=episode_num, time_length=time_length, episode_name=name,
        download_status='', download_url=episode_download_url,
        episode_url=url, summary=summary, image_url=image_url, sto_path='')

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def update_hot_weight(self, articles):
    """Recompute and persist the weight of each hot-topic record.

    Skips records whose WEIGHT is already 0.

    @param articles: ES hits (each with a '_source' document)
    @result: RECORD_TIME of the last record successfully updated in ES,
             or '' when nothing was updated.
    """
    # FIX: record_time was unbound (UnboundLocalError on return) when
    # `articles` was empty or every record was skipped. The unused
    # `release_time` local has been removed.
    record_time = ''
    for article in articles:
        article_info = article.get('_source')
        if article_info['WEIGHT'] == 0:
            continue

        data = {
            'hot_id': article_info['ID'],  # article id
            'hot_value': article_info['HOT'],  # hot value
            'clues_ids': article_info['CLUES_IDS'],  # clue ids matched by related sentiment
            'article_count': article_info['ARTICLE_COUNT'],  # total article count
            'vip_count': article_info["VIP_COUNT"],  # mainstream-media count
            'negative_emotion_count': article_info["NEGATIVE_EMOTION_COUNT"],  # negative-emotion count
            'zero_ids': article_info['ZERO_ID']
        }
        print('''
            release_time %s
            record_time  %s
            ''' % (article_info["RELEASE_TIME"], article_info["RECORD_TIME"]))

        result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS, data=data)
        weight = result.get('weight', 0)  # * weight_factor — region not taken into account
        tools.print_one_line("修改相关度 %s -> %s" % (article_info['WEIGHT'], weight))
        if self._yqtj_es.update_by_id('tab_iopm_hot_info',
                                      article_info['ID'], {"WEIGHT": weight}):
            record_time = article_info['RECORD_TIME']

    return record_time
def update_article_weight(self, articles):
    """Recompute and persist the weight of each article record.

    Skips records whose WEIGHT is already 0.

    @param articles: ES hits (each with a '_source' document)
    @result: (release_time, record_time) of the last record successfully
             updated in ES; ('', '') when nothing was updated.
    """
    release_time = ''
    # FIX: record_time was unbound (UnboundLocalError on return) when
    # `articles` was empty or every record was skipped.
    record_time = ''
    for article in articles:
        article_info = article.get('_source')
        if article_info['WEIGHT'] == 0:
            continue

        data = {
            'article_id': article_info['ID'],  # article id
            'clues_ids': article_info['CLUES_IDS'],  # clue ids
            'may_invalid': 0,  # possibly invalid (weibo containing @ or #)
            'vip_count': article_info['IS_VIP'],  # mainstream-media count
            'negative_emotion_count': article_info['EMOTION'],  # negative-emotion count
            'zero_ids': article_info['ZERO_ID']
        }
        print(article_info["TITLE"])
        print(article_info["RELEASE_TIME"])

        result = tools.get_json_by_requests(IOPM_SERVICE_ADDRESS, data=data)
        weight = result.get('weight', 0)  # * weight_factor — region not taken into account
        tools.print_one_line("修改相关度 %s -> %s" % (article_info['WEIGHT'], weight))
        if self._yqtj_es.update_by_id('tab_iopm_article_info',
                                      article_info['ID'], {"WEIGHT": weight}):
            release_time, record_time = article_info[
                "RELEASE_TIME"], article_info["RECORD_TIME"]

    return release_time, record_time
def add_root_url(keywords):
    """Seed video urls by searching Sina video for each keyword.

    Walks up to 9 result pages per keyword and stops early when the
    result list is empty or save_video_info signals a known video.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        keyword_done = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            video_info_list = info_json['list']
            if not video_info_list:
                print(url)
                break

            for video_info in video_info_list:
                is_continue = base_parser.save_video_info(
                    image_url=video_info['thumburl'],
                    url=video_info['url'],
                    title=tools.del_html_tag(video_info['videoname']),
                    release_time=video_info['showtime'],
                    site_name=NAME)
                if not is_continue:
                    # Already-seen video: move on to the next keyword.
                    keyword_done = True
                    break

            if keyword_done:
                break
def send_file(self, users, media_id):
    '''
    @summary: Push a file message to the given users (WeChat Work API).
    ---------
    @param users: recipients
    @param media_id: file id; obtainable from the temporary-media upload API
    ---------
    @result: response json of the send API
    '''
    url = ('https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=%s'
           % self._send_msg_access_token)
    payload = {
        "touser": users,
        "toparty": "",
        "totag": "",
        "msgtype": "file",
        "agentid": self._agentid,
        "file": {"media_id": media_id},
        "safe": 0,
    }
    body = tools.dumps_json(payload).encode('utf-8')
    return tools.get_json_by_requests(url=url, headers=HEADER, data=body)
def main():
    """Fetch hot-keyword analysis, join clue names from Oracle, and print
    the entries sorted by hot value (descending)."""
    url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotAnalysis_self'
    resp = tools.get_json_by_requests(url)
    # print(resp)

    records = []
    for entry in resp['data']:
        clus_id = list(entry.keys())[0]
        # Look up the clue's display name in Oracle.
        sql = 'select t.name from TAB_IOPM_CLUES t where id = ' + clus_id
        name = oracledb.find(sql)[0][0]
        for info in entry[clus_id]['data']:
            # print(name, info['kw'], info['hot'])
            records.append({
                'name': name,
                'kw': info['kw'],
                'hot': info['hot'],
                'clus_id': clus_id
            })

    for record in sorted(records, key=lambda obj: obj.get('hot'), reverse=True):
        print(record['name'], record['clus_id'], '--->',
              record['kw'], record['hot'])
def get_article(self):
    '''
    @summary: Currently filters on record_time to guarantee data; should
              normally filter on release_time  TODO
    ---------
    ---------
    @result: list of ES hits (possibly empty)
    '''
    per_record_time = self.get_per_record_time()
    today_time = tools.get_current_date('%Y-%m-%d')

    if per_record_time:
        sql = ("select * from {table} where record_time > '{record_time}' "
               "and release_time >= '{today_time} 00:00:00' "
               "and release_time <= '{today_time} 23:59:59' "
               "order by record_time").format(
                   table=self._table,
                   record_time=per_record_time,
                   today_time=today_time)
    else:
        sql = ("select * from {table} where release_time >= '{today_time} 00:00:00' "
               "and release_time <= '{today_time} 23:59:59' "
               "order by record_time").format(
                   table=self._table, today_time=today_time)

    url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS, sql=sql)
    log.debug(url)

    response = tools.get_json_by_requests(url)
    return response.get('hits', {}).get('hits', [])
def get_download_url(url):
    """Resolve an iQiyi page url to a raw video download url.

    Scrapes the page for tvid/albumid (trying several markup variants),
    queries the iface2 v_download endpoint, then follows the '1' quality
    entry's url to get the final "l" location. Returns '' on any failure.
    """
    html, r = tools.get_html_by_requests(url)

    # tvid: try the player attribute first, then the list attribute.
    tvid = re.compile('player-tvid="(\d{4,11})"').findall(str(html))
    if not tvid:
        tvid = re.compile('list-tvid="(\d{4,11})"').findall(str(html))
    # Collapse the match list to its last element (tvid becomes a str).
    for i in tvid:
        tvid = i

    # album_id: four markup variants, tried in order until one matches.
    album_id = ''.join(re.compile('player-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('list-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('albumId: ?(\d{4,11}),').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile('param\[\'albumId\'\] ?= ?"(\d{4,11})"').findall(str(html)))

    # Millisecond timestamp used as the request serial number.
    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)

    # v_download API call with hard-coded device/app parameters.
    url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time

    json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)

    try:
        # Pick the "'1'" quality block from the response, extract its url,
        # fetch that url, then read the final location from "l".
        video_download_url = ''.join(re.compile('\'1\': {(.+?)},').findall(str(json_)))
        video_download_url = ''.join(re.compile('\'url\': ?\'(.+?)\'').findall(str(video_download_url)))
        video_download_url, r = tools.get_html_by_requests(video_download_url)
        video_download_url = ''.join(re.compile('"l":"(.+?)"').findall(str(video_download_url)))
    except:
        # Best-effort: any parsing/network failure yields an empty url.
        video_download_url = ''

    return video_download_url
def export_callback(execute_type, sql, data_json):
    """Post-export hook for a hot-topic row: compute its weight via the IOPM
    service, then sync the enriched document to ES and Oracle.

    NOTE(review): `hot_info`, `hot_id`, `es` and `oracledb` are resolved from
    the enclosing scope — assumes this callback is defined where those exist.
    """
    if execute_type != ExportData.EXCEPTION:
        # Fetch the "about me" sentiment aggregates for this hot topic.
        hot_vip_article_count, negative_emotion_count, article_count, article_clues_ids = get_about_me_message(
            hot_info['kg'], hot_id)
        print('====================')

        # Compute the weight via the IOPM related-sort service.
        url = IOPM_SERVICE_ADDRESS + '/related_sort?hot_id=%d&hot_value=%s&clues_ids=%s&article_count=%s&vip_count=%s&negative_emotion_count=%s' % (
            hot_id, hot_info['hot'], article_clues_ids, article_count,
            hot_vip_article_count, negative_emotion_count)
        weight = tools.get_json_by_requests(url).get('weight', 0)
        print(url)
        print('----------------------------')

        # Sync the enriched document to ES.
        data_json['WEIGHT'] = weight
        data_json['IS_VIP'] = hot_vip_article_count
        data_json['NEGATIVE_EMOTION_COUNT'] = negative_emotion_count
        data_json['ARTICLE_COUNT'] = article_count
        data_json['ARTICLE_CLUES_IDS'] = article_clues_ids
        es.add(table='TAB_IOPM_HOT_INFO', data=data_json,
               data_id=data_json.get("ID"))

        # Update the matching row in the Oracle database.
        sql = "update tab_iopm_hot_info set is_vip = %s, weight= %s, negative_emotion_count = %s, article_count = %s, article_clues_ids = '%s' where id = %s" % (
            hot_vip_article_count, weight, negative_emotion_count,
            article_count, article_clues_ids, data_json["ID"])
        oracledb.update(sql)
def parser(url_info):
    """Parse a video-list JSON page and save every video entry found in it."""
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_json_by_requests(root_url, headers=headers)
    for data in jsonpath.jsonpath(html, '$..video_info'):
        title = data.get('title')
        video_url = data.get('play_url')
        img_url = data.get('cover_url')
        release_time = stamp_to_date(data.get('upline_time'))
        # info_type 1 when a playable url exists, 2 otherwise.
        info_type = 1 if video_url != '' else 2

        base_parser.save_info(
            'content_info', site_id=SITE_ID, url=video_url, title=title,
            site_name=NAME, content='', release_time=release_time,
            image_url=img_url, video_url=video_url, is_out_link=1,
            download_image=False, is_debug=False, info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser_comment(content_id, wall_id, page=1):
    """Recursively crawl the comment pages of one iQiyi article.

    Stops when deal_comment reports an already-seen comment or when a
    page comes back empty; otherwise recurses onto the next page.
    """
    log.debug('正在爬取第 %s 页文章评论 content_id = %s' % (page, content_id))
    flow_comment_url = 'http://sns-comment.iqiyi.com/v2/comment/get_comments.action?contentid={content_id}&page={page}&authcookie=null&page_size=40&wallId={wall_id}&agenttype=117&t={timestamp_m}'.format(
        content_id=content_id, page=page, wall_id=wall_id,
        timestamp_m=int(tools.get_current_timestamp() * 1000))

    comment_json = tools.get_json_by_requests(flow_comment_url)
    data = comment_json.get('data', {})
    # Usable as pagination hints.
    total_count = data.get('totalCount', 0)
    count = data.get('count', 0)

    replies = data.get('replies', [])
    for reply in replies:
        # A falsy deal_comment result means "seen before" — stop here
        # without recursing to the next page.
        if not deal_comment(reply.get("replySource", {})):
            return
        if not deal_comment(reply):
            return

    if replies:
        parser_comment(content_id, wall_id, page + 1)
def end_callback():
    """Spider end hook: clear the busy flag and report status 603 (done)."""
    log.info('\n********** news end **********')
    task_status.is_doing = False

    payload = {'tasks': str(tasks), 'status': 603}
    if tools.get_json_by_requests(update_task_url, data=payload):
        log.debug('更新任务状态 已做完!')
def main():
    '''
    @summary: Encrypt the clue configuration and sync it to both the
              intranet and extranet services, recording each attempt.
    ---------
    @param :
    ---------
    @result:
    '''
    clues_json = get_clues()
    clues_count = len(clues_json['data'])
    clues_json = tools.dumps_json(clues_json)
    print(clues_json)
    # save_clues_to_file(clues_json)

    encrypt_text = Prpcrypt('pattek.com.cn').encrypt(clues_json)
    data = {'info': encrypt_text}

    # Sync to the intranet service.
    url = 'http://192.168.60.38:8002/datasync_al/interface/cluesConfSync?'
    resp = tools.get_json_by_requests(url, data=data)
    # Record the sync attempt (flag 0 = intranet).
    result = record_sync_status(clues_count, resp.get("status"),
                                resp.get('message'), resp.get('data'), 0)
    print(result)
    log.debug('''
        ------ 同步线索到内网 -----
        %s
        记录到数据库 %d
        ''' % (resp, result))

    # Sync to the extranet service.
    url = 'http://124.205.229.232:8005/gdyq/datasync_al/interface/cluesConfSync'
    resp = tools.get_json_by_requests(url, data=data)
    # Record the sync attempt (flag 1 = extranet).
    result = record_sync_status(clues_count, resp.get("status"),
                                resp.get('message'), resp.get('data'), 1)
    log.debug('''
        ------ 同步线索到外网 -----
        %s
        记录到数据库 %d
        ''' % (resp, result))
def begin_callback():
    """Spider start hook: report status 602 (doing) back to the master."""
    log.info('\n********** news begin **********')

    payload = {'tasks': str(tasks), 'status': 602}
    if tools.get_json_by_requests(update_task_url, data=payload):
        log.debug('更新任务状态 正在做...')
def get_proxies():
    """Yield (ip, "ip:port") pairs fetched from the xdaili proxy API."""
    api_url = "http://api.xdaili.cn/xdaili-api//privateProxy/applyStaticProxy?spiderId=fadc76e39a074860aaf837b455001f75&returnType=2&count=10"
    api_json = tools.get_json_by_requests(api_url)

    ips = jsonpath(api_json, "$..ip")
    ports = jsonpath(api_json, "$..port")
    if not (ips and ports):
        return
    # Renamed loop variable: the original shadowed `ports` inside the loop.
    for ip, port in zip(ips, ports):
        yield ip, ip + ":" + port
def export_callback(execute_type, sql, data_json):
    """Post-export hook: fetch the full articles for the exported info ids
    and push them into the Oracle article table (also synced to ES).

    NOTE(review): `data`, `root_url` and `hot_id` are resolved from the
    enclosing scope — assumes this callback is defined where those exist.
    """
    if execute_type != ExportData.EXCEPTION:
        infoIds = data['infoIds']
        url = root_url % infoIds
        json = tools.get_json_by_requests(url, headers=HEADERS)
        articles = json['data']

        # Reference shape of the target article document (sample values):
        # "EMOTION": 'vint_3',
        # "ACCOUNT": null,
        # "WEIGHT": 0,
        # "TITLE": "str_title",
        # "URL": "str_url",
        # "MAY_INVALID": ,
        # "CLUES_IDS": "",
        # "WEBSITE_NAME": "str_site",
        # "KEYWORDS_COUNT": 1,
        # "HOST": "str_site",
        # "INFO_TYPE": 'int_type',
        # "COMMENT_COUNT": null,
        # "HOT_ID": "vint_%d"%hot_id,
        # "REVIEW_COUNT": null,
        # "UUID": "73ec16038e074530ff109e3cfad2594c",
        # "ID": 'vint_%d'%article_id,
        # "IS_VIP": null,
        # "IMAGE_URL": 'str_picture',
        # "KEYWORDS": "str_keywords",
        # "KEYWORD_CLUES_ID": "{"中央电视台":"88758"}",
        # "RELEASE_TIME": "date_pubtime",
        # "AUTHOR": "江门日报",
        # "CONTENT": "clob_content",
        # "RECORD_TIME": 'vdate_%s'%tools.get_current_date(),
        # "UP_COUNT": 'vint_null'

        # Maps article keys to Oracle column type-tags understood by
        # export_to_oracle (e.g. clob_/str_/date_/int_ prefixes).
        key_map = {
            'id': 'int_dataId',
            'content': 'clob_content',
            'url': 'str_url',
            'website_name': 'str_site',
            'image_url': 'str_picture',
            'release_time': 'date_pubtime',
            'keywords': 'str_keywords',
            'emotion': 'str_emotion',
            'host': 'str_site',
            'title': 'str_title',
            'info_type': 'int_type',
            'hot_id': "vint_%d" % hot_id,
            'record_time': 'vdate_%s' % tools.get_current_date()
        }

        # Deduplicate on url; also mirror the rows into ES.
        export_data.export_to_oracle(
            key_map=key_map,
            aim_table='TAB_IOPM_ARTICLE_INFO',
            unique_key='url',
            datas=articles,
            unique_key_mapping_source_key={'url': 'str_url'},
            sync_to_es=True)
def get_tags(self):
    '''
    @summary: Fetch the tag list of this WeChat Work app and print it.
    ---------
    ---------
    @result:
    '''
    url = ('https://qyapi.weixin.qq.com/cgi-bin/tag/list?access_token='
           + self._send_msg_access_token)
    result = tools.get_json_by_requests(url)
    # NOTE(review): `tools.print` looks like a possible typo for the builtin
    # print(); kept as-is to preserve behavior — confirm tools exposes it.
    tools.print(result)
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    """Recursively crawl iQiyi circle feed pages and store each article.

    Stops when add_article reports a duplicate (returns falsy) or a page is
    empty; otherwise recurses using the last feed's feedId/snsTime as the
    paging cursor.
    """
    # NOTE(review): "¬ice=1" in the url looks like a mangled "&notice=1"
    # (HTML-entity decoding of "&not") — verify against the live API.
    article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1¬ice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(wall_id = wall_id, feed_id = feed_id, sns_time = sns_time, timestamp_m = int(tools.get_current_timestamp() * 1000))
    print(article_json_url)
    article_json = tools.get_json_by_requests(article_json_url)

    wall_id = article_json.get('data', {}).get('wallId')
    # Feed array
    feeds = article_json.get('data', {}).get('feeds', [])
    for feed in feeds:
        article_id = feed.get('commentId')
        head_url = feed.get('icon')
        name = feed.get('name')
        release_time = feed.get('releaseDate')
        release_time = tools.timestamp_to_date(release_time)
        title = feed.get('feedTitle')
        content = feed.get('description')
        image_urls = ','.join([img.get('url') for img in feed.get('pictures', [])])  # comma separated
        watch_count = feed.get('uvCount')
        up_count = feed.get('agreeCount')
        comment_count = feed.get('commentCount')

        log.debug('''
            id:       %s
            节目id    %s
            头像地址:  %s
            名字:     %s
            发布时间:  %s
            标题:     %s
            内容:     %s
            图片地址:  %s
            观看量:    %s
            点赞量:    %s
            评论量:    %s
            ''' % (article_id, video_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count))

        # gender/emotion are randomized placeholders — presumably no real
        # source for them in this feed; confirm downstream expectations.
        if self_base_parser.add_article(article_id, head_url, name, release_time, title,
                                        content, image_urls, watch_count, up_count,
                                        comment_count, program_id = video_id,
                                        gender = random.randint(0,1), url = url,
                                        info_type = 3, emotion = random.randint(0,2),
                                        collect = 0, source = '爱奇艺'):
            # Parse this article's comments.
            parser_comment(article_id, wall_id)
        else:
            # Duplicate article: stop paging.
            break
    else:
        # Loop finished without hitting a duplicate: recurse to next page.
        if feeds:
            feed_id = feeds[-1].get('feedId')
            sns_time = feeds[-1].get('snsTime')
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
def inner_add_url(url, keyword):
    """Follow the search-result pagination, queueing each next-page url."""
    while url:
        html_json = tools.get_json_by_requests(url)
        page_stack = tools.get_json_value(html_json, 'obj.pageNumberStack')
        has_next = tools.get_json_value(html_json, 'obj.hasNext')
        if not has_next:
            break
        # Build and queue the next page, then keep walking from it.
        url = 'http://sj.qq.com/myapp/searchAjax.htm?kw=%s&pns=' % keyword + page_stack + '&sid=0'
        base_parser.add_url('WWA_search_app_urls', SITE_ID, url)
def get_article_count_msg(begin_time, end_time):
    """Build the crawl-statistics report for the window [begin_time, end_time].

    Runs four count(*) queries against two ES clusters (raw data pool and
    business DB) and formats the counts into a human-readable message.

    @param begin_time: window start, 'YYYY-MM-DD HH:MM:SS' style string
    @param end_time: window end, same format
    @result: the formatted report string
    """
    def _count(address, sql):
        # Run one count(*) SQL against an ES _sql endpoint; return the value.
        data = tools.get_json_by_requests(address + sql)
        return data.get('aggregations').get('COUNT(*)').get('value')

    data_pool_address = 'http://192.168.60.16:9200/_sql?sql='
    iopm_db_address = 'http://192.168.60.27:9200/_sql?sql='

    # Articles crawled (by record time).
    total_article_count = _count(
        data_pool_address,
        "SELECT count(*) FROM news_article where record_time >= '{begin_time}' and record_time <= '{end_time}'".format(
            begin_time=begin_time, end_time=end_time))

    # Newly released articles crawled (by release time).
    new_article_count = _count(
        data_pool_address,
        "SELECT count(*) FROM news_article where release_time >= '{begin_time}' and release_time <= '{end_time}'".format(
            begin_time=begin_time, end_time=end_time))

    # Articles that made it into the business DB.
    iopm_total_article_count = _count(
        iopm_db_address,
        "SELECT count(*) FROM tab_iopm_article_info where INFO_TYPE = 1 and RECORD_TIME >= '{begin_time}' and RECORD_TIME <= '{end_time}'".format(
            begin_time=begin_time, end_time=end_time))

    # Newly released articles in the business DB.
    iopm_new_article_count = _count(
        iopm_db_address,
        "SELECT count(*) FROM tab_iopm_article_info where INFO_TYPE = 1 and RELEASE_TIME >= '{begin_time}' and RELEASE_TIME <= '{end_time}'".format(
            begin_time=begin_time, end_time=end_time))

    article_count_msg = '''
    \r共抓取到有效文章数量:%s
    \r共抓取到新发布文章数量:%s
    \r去重后入业务库文章总量: %s
    \r去重后入业务库新发布的文章数量:%s
    ''' % (total_article_count, new_article_count, iopm_total_article_count,
           iopm_new_article_count)

    return article_count_msg
def get_proxies():
    """Return a list of {'ip', 'proxy'} dicts from the xdaili proxy API,
    or None when the API yielded no ips/ports."""
    api_url = "http://api.xdaili.cn/xdaili-api//privateProxy/applyStaticProxy?spiderId=afadc76e39a074860aaf837b455001f75&returnType=2&count=10"
    api_json = tools.get_json_by_requests(api_url)

    ips = jsonpath(api_json, "$..ip")
    ports = jsonpath(api_json, "$..port")
    if not (ips and ports):
        return None
    # Loop variable renamed: the original shadowed `ports` inside the loop.
    return [{'ip': ip, 'proxy': ip + ":" + port}
            for ip, port in zip(ips, ports)]
def getdownload(episode_download_url_json):
    """Resolve the final letv download location from a playurl-dispatch json url."""
    episode_json = tools.get_json_by_requests(episode_download_url_json)
    #print(episode_download_url_json)

    domains = tools.get_json_value(episode_json, 'msgs.playurl.domain')
    domain = domains and domains[0] or ''
    #print('-----',domain)

    dispatch = tools.get_json_value(episode_json, 'msgs.playurl.dispatch.1080p')
    dispatch_path = dispatch and dispatch[0] or ''
    print(dispatch_path, '*********')

    # Stitch domain + dispatch path, then append the fixed pc-player params.
    download_url = domain + dispatch_path
    download_url += "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
        random.random(), '1080p')

    final_json = tools.get_json_by_requests(download_url)
    return tools.get_json_value(final_json, 'location')
def add_department(self, name):
    """Create a department under the root (parentid=1); return its id."""
    url = ('https://qyapi.weixin.qq.com/cgi-bin/department/create?access_token=%s'
           % self._sync_user_access_token)
    payload = tools.dumps_json({"name": name, "parentid": 1}).encode('utf-8')
    # Typical response: {'errcode': 0, 'id': 4, 'errmsg': 'created'}
    result = tools.get_json_by_requests(url, headers=HEADER, data=payload)
    return result.get("id")
def get_news_article(self):
    """Fetch news articles from ES newer than the last recorded time.

    On the first run (no recorded time) only the earliest article is taken.
    """
    news_record_time = self._get_per_record_time()
    if news_record_time:
        sql = ('select * from news_article where record_time > {record_time} '
               'order by record_time').format(record_time=news_record_time)
    else:
        sql = 'select * from news_article order by record_time limit 1'

    url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS, sql=sql)
    print(url)

    response = tools.get_json_by_requests(url)
    return response.get('hits', {}).get('hits', [])
def main():
    """Master loop: poll the master for crawl-task batches and run a Spider
    for each batch. Runs forever; sleeps while a batch is in progress.
    """
    while True:
        if task_status.is_doing:
            log.debug('正在做 不取任务')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # Fetch the next task batch from the master.
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'
        data = tools.get_json_by_requests(get_task_url)
        # tasks = [[209690, '百度新闻', 11, 'http://news.baidu.com/?tn=news', 3]]
        print(data)
        tasks = data.get('tasks', [])
        parser_count = data.get('thread_count')

        def begin_callback():
            # Spider start hook: report status 602 (doing) to the master.
            log.info('\n********** news begin **********')
            data = {'tasks': str(tasks), 'status': 602}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('更新任务状态 正在做...')

        def end_callback():
            # Spider end hook: clear the busy flag, report status 603 (done).
            log.info('\n********** news end **********')
            task_status.is_doing = False
            data = {'tasks': str(tasks), 'status': 603}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('更新任务状态 已做完!')

        # Configure the spider for this batch.
        spider = Spider(tab_urls='news:news_urls',
                        parser_count=parser_count,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=tasks,
                        delete_tab_urls=False)

        # Register the parser and run (start blocks until the batch is done).
        spider.add_parser(news_parser)
        spider.start()
def get_biz(self, account_id='', account=''):
    '''
    @summary: Look up an official account's __biz (fakeid) parameter.
    ---------
    @param account_id: account id (takes precedence over account)
    @param account: account display name
    ---------
    @result: the __biz string, or '' when no exact match was found
    '''
    keyword = account_id or account  # account id takes precedence
    log.debug('search keywords ' + keyword)

    __biz = ''
    url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz'
    params = {
        "count": "5",
        "begin": "0",
        "action": "search_biz",
        "lang": "zh_CN",
        "random": str(random.random()) + str(random.randint(1, 9)),
        "ajax": "1",
        "token": TOOKEN,
        "f": "json",
        "query": keyword
    }

    account_json = tools.get_json_by_requests(url, params=params, headers=HEADERS)
    # When TOOKEN has expired the API returns
    # {'base_resp': {'ret': 200003, 'err_msg': 'invalid session'}}
    account_list = account_json.get("list", [])
    keyword_lower = keyword.lower()
    for account_info in account_list:
        # FIX: nickname/alias may be absent from a result entry; the original
        # called .lower() directly on .get(...) and raised AttributeError
        # when the value was None.
        nickname = (account_info.get('nickname') or '').lower()
        alias = (account_info.get('alias') or '').lower()
        if nickname == keyword_lower or alias == keyword_lower:
            __biz = account_info.get('fakeid', '')
            break

    log.debug('''
        公众号名称 %s
        公众号账号 %s
        __biz     %s
        ''' % (account, account_id, __biz))
    return __biz
def parser_episode_detail_url(url_info):
    """Parse an mgtv episode-list JSON: queue every episode's detail url,
    plus the list urls for other years/months of the same program.

    Marks the current url DONE in PROGRAM_urls on every exit path.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    # FIX: was `code is not 200` — identity comparison with an int is
    # implementation-dependent (small-int caching); use value equality.
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})

    # Queue the detail url of every episode on this page (depth 2).
    episode_list = episode_data.get('list', [])
    for episode in episode_list:
        # str() guards against the API returning video_id as a number.
        episode_id = str(episode['video_id'])
        episode_detail_url = 'http://pcweb.api.mgtv.com/player/video?video_id=' + episode_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=2,
                            remark={'program_mongo_id': program_mongo_id,
                                    'program_id': program_id})

    # Queue the list urls for other years and months (depth 1).
    episode_years = episode_data.get('tab_y', [])
    episode_months = episode_data.get('tab_m', [])
    for episode_year in episode_years:
        # year = episode_year['t']
        temp_program_id = episode_year['id']
        # No month parameter: defaults to the most recent month's data.
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s' % temp_program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id,
                                    'program_id': temp_program_id})

    for episode_month in episode_months[1:]:  # skip the most recent month (already covered)
        episode_month = episode_month['m']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s' % (program_id, episode_month)
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id,
                                    'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def __invite_user(self, user_id):
    '''
    @summary: Invite a member via the WeChat Work batch-invite API.
    ---------
    @param user_id: id of the user to invite
    ---------
    @result: response json
    '''
    url = ('https://qyapi.weixin.qq.com/cgi-bin/batch/invite?access_token='
           + self._sync_user_access_token)
    payload = tools.dumps_json({"user": [user_id]}).encode('utf-8')
    return tools.get_json_by_requests(url, headers=HEADER, data=payload)
def is_have_new_article(self, __biz):
    '''
    @summary: Check whether the official account has published today.
    ---------
    @param __biz: the account's __biz (fakeid) parameter
    ---------
    @result: constance.UPDATE / NOT_UPDATE, or constance.ERROR when the
             article list is empty (e.g. expired token)
    '''
    log.debug('search keywords ' + __biz)

    url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
    params = {
        "lang": "zh_CN",
        "token": TOOKEN,
        "query": "",
        "f": "json",
        "count": "5",
        "action": "list_ex",
        "ajax": "1",
        "type": "9",
        "fakeid": __biz,
        "random": str(random.random()) + str(random.randint(1, 9)),
        "begin": "0"
    }

    articles_json = tools.get_json_by_requests(url, params=params, headers=HEADERS)
    # print(articles_json)
    # When TOOKEN has expired the API returns
    # {'base_resp': {'err_msg': 'invalid csrf token', 'ret': 200040}}
    article_list = articles_json.get('app_msg_list', [])
    if not article_list:
        return constance.ERROR

    # Only the newest entry matters (the original returned on the first
    # loop iteration).
    latest = article_list[0]
    release_time = tools.timestamp_to_date(latest.get('update_time'))
    log.debug("最近发文时间 %s" % release_time)

    if release_time >= tools.get_current_date('%Y-%m-%d'):
        return constance.UPDATE
    return constance.NOT_UPDATE
def update_user(self, user_id, user_name='', mobile='', email='', enable=1):
    """Update a WeChat Work member's profile.

    @param user_id: member id
    @param user_name: new display name
    @param mobile: new mobile number
    @param email: new email address
    @param enable: 1 to enable the account, 0 to disable it
    @result: response json
    """
    url = 'https://qyapi.weixin.qq.com/cgi-bin/user/update?access_token=' + self._sync_user_access_token
    data = {
        "userid": user_id,
        "name": user_name,
        "mobile": mobile,
        "email": email,
        # FIX: was hard-coded to 1, silently ignoring the `enable` parameter.
        "enable": enable,
    }
    data = tools.dumps_json(data).encode('utf-8')
    result = tools.get_json_by_requests(url, headers=HEADER, data=data)
    return result