def crawler_restart():
    # Ask the crawler to restart, then poll the feedback channel for an
    # acknowledgement; give up after roughly two minutes.
    r = get_redis()
    r.publish('crawler_signals', 'restart')
    sub = r.pubsub()
    sub.subscribe(['crawler_signals_feedback'])
    flag = 0
    while True:
        if flag > 120:
            return jsonify({'is_success': False})
        msg = sub.get_message()
        if msg and msg.get('data') == 'restarted':
            return jsonify({'is_success': True})
        time.sleep(1)
        flag += 1

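# Assumed counterpart (not defined in this module): a crawler process that
# subscribes to 'crawler_signals', restarts itself when it receives 'restart',
# and then publishes 'restarted' on 'crawler_signals_feedback' -- which is what
# the wait loop in crawler_restart() above is polling for.
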
def tmt_wechat_import():
    # Import a WeChat article pushed by the TMT feed; only articles from the
    # whitelisted accounts in sources_map are stored and broadcast.
    items = get_collection('items')
    fields = request.get_json(force=True)
    if not fields:
        return jsonify({'is_success': False, 'error_msg': 'ARGS ERROR'})
    sources_map = [
        u'腾讯科技', u'科技每日推送', u'科技最前线', u'财新TMT', u'网易科技',
        u'新浪科技', u'凤凰科技', u'慧聪TMT', u'蓝媒TMT', u'蓝鲸TMT',
        u'阿玻罗金融科技', u'TMT每日观察', u'朱劲松-TMT观察', u'TMT观察',
        u'杨吉TMT', u'搜狐科技', u'雷锋网', u'36氪', u'虎嗅网',
        u'21世纪经济报道', u'创业邦杂志', u'中国经营报', u'经济观察报',
        u'铅笔道', u'财新网', u'并购汪', u'亿欧网', u'新智元', u'猎云网',
        u'机器之心', u'海外情报社', u'FT中文网', u'界面', u'雷帝触网',
        u'好奇心日报', u'商业周刊中文版', u'环球老虎财经', u'钛媒体',
        u'PingWest品玩', u'速途网', u'第一财经', u'秦朔朋友圈', u'IT桔子',
        u'DoNews', u'动点科技', u'全球企业动态', u'蓝鲸财经网',
        u'财经天下周刊', u'VRAR创投圈', u'蓝鲸财经记者工作平台', u'B楼12座',
        u'经纬创投', u'小道消息', u'VR时代', u'财经女记者部落', u'真格基金',
        u'峰瑞资本', u'keso怎么看', u'一见',
    ]
    filtered_data = {}
    if fields.get('wechatName') in sources_map:
        inserted = items.insert_one({
            "content": fields.get('content', ''),
            "source": u'{}__{}'.format(fields.get('wechatId', ''),
                                       fields.get('originId', '')),
            "host": 'http://mp.weixin.qq.com/',
            "link": fields.get('sourceUrl', ''),
            "time": fields.get('createdAt', ''),
            "name": u"微信 - %s" % fields.get('wechatName'),
            "title": fields.get('title', ''),
            "time_human": "",
            "ctime": int(time.time()),
            "coops": ["tmt"],
        })
        redis = get_redis()
        # Notify downstream consumers that a new item is available.
        rt = redis.publish('news_updated', str(inserted.inserted_id))
        filtered_data = {
            'db_inserted_id': str(inserted.inserted_id),
            'redis_published_return': rt,
        }
    return jsonify({'is_success': True, 'filtered_data': filtered_data})

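# Illustrative sketch only: the payload shape tmt_wechat_import() reads above,
# with hypothetical values. 'wechatName' must match one of the whitelisted
# accounts in sources_map; otherwise the item is skipped and the endpoint
# returns {'is_success': True, 'filtered_data': {}}.
EXAMPLE_WECHAT_PAYLOAD = {
    'wechatName': u'36氪',                 # must appear in sources_map
    'wechatId': 'example-wechat-id',
    'originId': 'example-origin-id',
    'title': u'Example article title',
    'content': u'Example article body',
    'sourceUrl': 'http://mp.weixin.qq.com/s/example',
    'createdAt': 'example-created-at',     # stored verbatim in the "time" field
}
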
def app_push_import():
    # Import an app push notification: check the shared key, deduplicate by an
    # MD5 of title + packageName + contentText, and copy pushes from known apps
    # into the main 'items' collection.
    import_items = get_collection('app_push_items')
    items = get_collection('items')
    fields = request.get_json(force=True)
    auth_key = fields.get('auth_key')
    if auth_key != '16b4af8':
        return jsonify({'is_success': False, 'error_msg': 'AUTH ERROR'})
    del fields['auth_key']
    if not fields:
        return jsonify({'is_success': False, 'error_msg': 'ARGS ERROR'})
    unique_title = fields.get('title', '') + fields.get(
        'packageName', 'N') + fields.get('contentText', '')
    uid = hashlib.md5(unique_title.encode('utf-8')).hexdigest()
    fields.update({'uid': uid})
    # Upsert into the raw push collection; a matched document means this push
    # was already imported.
    result = import_items.update_one({'uid': uid}, {'$set': fields},
                                     upsert=True)
    if result.matched_count > 0:
        return jsonify({'is_success': False, 'error_msg': 'DUPLICATED ERROR'})
    sources_map = [
        {
            'title': u'新华社',
            'packageName': "net.xinhuamm.mainclient",
            'titleField': 'contentText',
            'bodyField': '',
            'website': 'http://www.xinhuanet.com/',
        },
        {
            'title': u'人民日报',
            'packageName': "com.peopledailychina.activity",
            'titleField': 'title',
            'bodyField': 'contentText',
            'website': 'http://www.people.com.cn/',
        },
        {
            'title': u'二十一财经',
            'packageName': "com.twentyfirstcbh.epaper",
            'titleField': 'contentText',
            'bodyField': '',
            'website': 'http://www.21jingji.com/',
        },
    ]
    filtered_data = {}
    for source in sources_map:
        if fields.get('packageName') == source.get('packageName'):
            mname = 'APP_%s' % fields.get('packageName')
            inserted = items.insert_one({
                "content": fields.get(source.get('bodyField'))
                           if source.get('bodyField') else '',
                "source": mname,
                "host": fields.get('packageName'),
                "link": source.get('website'),
                "time": fields.get('notifyTime'),
                "name": u"%s - APP推送" % source.get('title'),
                "title": fields.get(source.get('titleField')),
                "time_human": "",
                "ctime": int(time.time()),
            })
            redis = get_redis()
            # Notify downstream consumers that a new item is available.
            rt = redis.publish('news_updated', str(inserted.inserted_id))
            filtered_data = {
                'db_inserted_id': str(inserted.inserted_id),
                'redis_published_return': rt,
            }
            break
    return jsonify({'is_success': True, 'filtered_data': filtered_data})

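# Illustrative sketch only: the payload shape app_push_import() reads above,
# with hypothetical values. 'auth_key' must equal the hard-coded key and
# 'packageName' must match one of the sources_map entries for the push to be
# copied into the main 'items' collection.
EXAMPLE_APP_PUSH_PAYLOAD = {
    'auth_key': '16b4af8',
    'packageName': 'com.peopledailychina.activity',  # 人民日报 entry
    'title': u'Example push title',        # item title for this source
    'contentText': u'Example push body',   # item content for this source
    'notifyTime': 'example-notify-time',   # stored verbatim in the "time" field
}
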
def crawler_start():
    r = get_redis()
    r.publish('crawler_signals', 'start')
    return jsonify({'is_success': True})

def crawler_reload():
    r = get_redis()
    r.publish('crawler_signals', 'reload')
    return jsonify({'is_success': True})