def main():
    """Drive ``run`` once per simulated user.

    The number of iterations comes from ``args.fake_users``. Each iteration
    asks ``get_random_channel_name`` for a channel, asks
    ``get_random_video_name`` for a video of that channel, and hands both to
    ``run`` with ``delay=1``.
    """
    args = _parse_args()
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    # The loop index itself is never used; only the iteration count matters.
    for _ in range(args.fake_users):
        picked_channel = get_random_channel_name()
        picked_video = get_random_video_name(picked_channel)
        run(channel_name=picked_channel, video_name=picked_video, delay=1)
def test_get_login_access_token():
    """Check the token produced by ``Login`` and the login URL it builds."""
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    portal_host = Config.instance().get('PORTAL_SERVER')
    login = Login(host=portal_host, cache_path='/cache/path')

    # NOTE(review): ``login.login`` is read as an attribute, not called --
    # presumably a property on Login; confirm against the class definition.
    token = login.login
    valid_access_token(token)

    assert login.url == path.join(login.host, 'Users/login')
def main():
    """Consume the ``online_update_comment`` queue until interrupted or failing.

    ``rabbitmq.consume(callback)`` is called in a loop. On ``KeyboardInterrupt``
    or any other ``Exception`` the two publisher scripts are signalled with
    ``pkill -15``, the event is logged, and the loop ends. The RabbitMQ
    connection is closed on every exit path.
    """
    Config.set_dir(path.join(CURRENT_PATH, 'config.json'))
    rabbitmq = RabbitMqTasks('localhost',
                             exchange='',
                             queue_name='online_update_comment',
                             durable=True)

    def _kill_publishers():
        # NOTE(review): the command strings are kept exactly as before. Some
        # pkill implementations accept only a single pattern, in which case
        # ``pkill -15 -f 'python ./<script>'`` may be what was intended --
        # confirm on the deployment platform.
        for script in ('publish_comment_md.py', 'publish_comment_redis.py'):
            os.system('pkill -15 python ./{}'.format(script))

    try:
        while True:
            try:
                rabbitmq.consume(callback)
            except KeyboardInterrupt:
                _kill_publishers()
                logger.warning(
                    'keyboard interrupt, then kill publish_comment_md.py & publish_comment_redis.py'
                )
                break
            except Exception as e:
                _kill_publishers()
                # This branch used to log "keyboard interrupt" as well, which
                # misreported the reason for shutting the publishers down.
                logger.warning(
                    'unexpected exception, then kill publish_comment_md.py & publish_comment_redis.py'
                )
                logger.error('main exception: {}'.format(e))
                break
    finally:
        # Runs even for exceptions not handled above (e.g. SystemExit).
        rabbitmq.close()
def test_get_video_comment():
    """Check the shape of ``YoutubeComments.get_video_comment`` output.

    The result must be a dict whose entry for the requested video id is a
    list; that list is then passed to ``_valid_comment``.
    """
    Config.set_dir(path.join(os.getcwd(), 'src/config.json'))
    yt_comments = YoutubeComments(Config.instance().get('YOUTUBE_API_KEYS')[0])

    video_id = 'RnAXPLG_di8'
    video_comments_detail = yt_comments.get_video_comment(video_id)

    # isinstance instead of ``type(...) ==`` so dict/list subclasses are
    # accepted, matching the isinstance checks used by the other tests.
    assert isinstance(video_comments_detail, dict)
    comments = video_comments_detail.get(video_id)
    assert isinstance(comments, list)
    _valid_comment(comments)
def main():
    """Fetch the comments of ``args.video_id`` and save them under TRAIN_DIR.

    When no video id was supplied the failure is logged and the function
    returns without touching the YouTube API or writing anything.
    """
    args = _parse_args()
    Config.set_dir(path.join(CURRENT_PATH, 'config.json'))

    if not args.video_id:
        logger.error('main fail with: video id ')
        # Previously execution fell through and queried the API with the
        # missing id; stop here instead.
        return

    yt_comments = YoutubeComments(args.youtube_api_key)
    video_comment_detail = yt_comments.get_video_comment(args.video_id)
    CommentsUnlabelData().save(TRAIN_DIR, video_comment_detail)
def _get_channels_id_code(channels_id):
    """Yield ``(channelId, code)`` for the channels listed by YoutubeChannel.

    Args:
        channels_id: Container of channel ids to keep. Any falsy value
            (``None``, ``[]``, ``()``) disables filtering, so every channel
            is yielded.

    Yields:
        Tuples of the channel's ``channelId`` and ``code`` values.
    """
    yt_channels = YoutubeChannel(host=Config.instance().get('PORTAL_SERVER'),
                                 cache_path=Config.instance().get('CACHE_DIR'))
    for channel in yt_channels:
        channel_id = channel.get('channelId')
        # Truthiness instead of ``!= []``: the old comparison raised
        # TypeError for None and filtered out everything for an empty tuple.
        if channels_id and channel_id not in channels_id:
            continue
        yield channel_id, channel.get('code')
def get_random_channel_name():
    """Return the ``channelName`` of a channel picked via ``get_random_index``.

    Only the ``channelName`` field is requested from YoutubeChannel.
    """
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    only_names = {"fields": {"channelName": True}}

    channels = YoutubeChannel(host=portal_host,
                              cache_path=cache_dir,
                              filter_params=only_names)

    position = get_random_index(len(channels))
    chosen = channels[position]
    return chosen.get('channelName')
def get_random_video_name(channel_name):
    """Return the ``videoName`` of a video picked via ``get_random_index``.

    Args:
        channel_name: Value used in the ``channelName`` ``where`` filter
            passed to YoutubeVideo.
    """
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    by_channel = {"where": {"channelName": channel_name}}

    channel_videos = YoutubeVideo(host=portal_host,
                                  cache_path=cache_dir,
                                  filter_params=by_channel)

    position = get_random_index(len(channel_videos))
    chosen = channel_videos[position]
    return chosen.get('videoName')
def _modify_video_update_times(self, video_id):
    """Patch ``updateTimes`` to its current value plus one for *video_id*.

    Videos are queried with a ``videoId`` filter; each returned entry whose
    ``videoId`` equals *video_id* is patched. A missing ``updateTimes`` is
    treated as 0.
    """
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    matching_videos = YoutubeVideo(
        host=portal_host,
        cache_path=cache_dir,
        filter_params={"where": {"videoId": video_id}})

    for entry in matching_videos:
        # Re-check the id on each entry even though the query filters on it.
        if video_id != entry.get('videoId', ''):
            continue
        record_id = entry['id']
        bumped = entry.get('updateTimes', 0) + 1
        matching_videos.patch(id=record_id, json_data={'updateTimes': bumped})
def _get_videos_detail(self, channel_id):
    """Build a YoutubeVideo query for a channel's videos with ``updateTimes`` 0.

    Args:
        channel_id: Value used in the ``channelId`` ``where`` filter.

    Returns:
        The YoutubeVideo object constructed with that filter.
    """
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    never_updated = {"where": {"channelId": channel_id, "updateTimes": 0}}
    return YoutubeVideo(host=portal_host,
                        cache_path=cache_dir,
                        filter_params=never_updated)
def test_api_get_access_token():
    """Check that ``update_params_token`` keeps the filter and adds a token."""
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    api = Api(host=portal_host,
              target_path='Youtube_channels',
              cache_path=cache_dir)

    query = {'where': {'channelId': 'UC6FcYHEm7SO1jpu5TKjNXEA'}}
    enriched = api.update_params_token(params=query)

    assert isinstance(enriched, dict)
    # The original ``where`` clause must still be present in the result.
    assert enriched.get('where') == {'channelId': 'UC6FcYHEm7SO1jpu5TKjNXEA'}
    valid_access_token(enriched.get('access_token'))
def main():
    """Store per-channel commenter statistics through MdStatisticCommenter.

    For every ``(channel_id, code)`` pair selected by ``args.channels_id``,
    the rows produced by ``_get_commenter_statistic(code)`` are passed to
    ``update_data`` on the collection named by ``_get_statistic_collection``.
    """
    args = _parse_args()
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    # Materialise the channel pairs before any statistics work starts.
    channel_pairs = list(_get_channels_id_code(args.channels_id))
    for channel_id, code in channel_pairs:
        rows = list(_get_commenter_statistic(code))
        writer = MdStatisticCommenter(
            cluster=args.cluster,
            db=args.db,
            collection=_get_statistic_collection(channel_id))
        writer.update_data(rows)
def test_get_md_data():
    """Check that one known comment id maps to exactly one complete document."""
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    md = Mongodb(cluster_name='raw-comment-chinese',
                 db_name='comment-chinese',
                 collection_name='comment-UC6FcYHEm7SO1jpu5TKjNXEA')
    query = {'commentId': 'Ugw-4khRtnDqAAmdp1Z4AaABAg'}
    matches = list(md.get(query))

    assert len(matches) == 1
    document = matches[0]
    assert isinstance(document, dict)
    for required_key in ('videoId', 'author', 'text'):
        assert required_key in document
def gen(self):
    """Build and save one comments DataFrame per channel.

    For each channel id listed by YoutubeChannel, the parent initialiser is
    re-run with that channel's collection name, the rows from
    ``gen_comment_dataset`` are collected into a DataFrame with
    ``self.columns``, and the frame is handed to ``_save``.
    """
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    channel_listing = YoutubeChannel(
        host=portal_host,
        cache_path=cache_dir,
        filter_params={"fields": {"channelId": True}})

    for entry in channel_listing:
        channel_id = entry['channelId']
        target_collection = self.gen_collection(channel_id)

        # Re-run the parent initialiser with this channel's collection name
        # before reading the comment dataset.
        super(MdCommentLoader, self).__init__(
            cluster_name=self.cluster,
            db_name=self.database,
            collection_name=target_collection)

        logger.info('gen channel id: {} comments data'.format(channel_id))
        rows = list(self.gen_comment_dataset())
        frame = pd.DataFrame(rows, columns=self.columns)
        self._save(frame, channel_id)
def main():
    """Run MdCommentSentimentUpdater over the requested channels.

    When ``args.channels_id`` is empty it is replaced by the ``channelId`` of
    every channel listed by YoutubeChannel. ``args.update_all`` is forwarded
    to ``update_channels`` unchanged.
    """
    args = _parse_args()
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    if not args.channels_id:
        portal_host = Config.instance().get('PORTAL_SERVER')
        cache_dir = Config.instance().get('CACHE_DIR')
        all_channels = YoutubeChannel(
            host=portal_host,
            cache_path=cache_dir,
            filter_params={"fields": {"channelId": True}})

        collected_ids = []
        for entry in all_channels:
            collected_ids.append(entry.get('channelId'))
        args.channels_id = collected_ids

    updater = MdCommentSentimentUpdater(cluster=args.cluster, db=args.db)
    updater.update_channels(args.channels_id, args.update_all)
def main():
    """Feed messages from the RabbitMQ fanout into ``RedisHandler.callback``.

    ``consume`` is called in a loop. ``KeyboardInterrupt`` and any other
    ``Exception`` are logged and end the loop. The fanout connection is
    closed exactly once, when the loop is left.
    """
    args = _parse_args()
    Config.set_dir(path.join(CURRENT_PATH, 'config.json'))
    mq_fanout = RabbitMqFanout(args.rabbitmq_host, args.rabbitmq_queue)
    redis_handler = RedisHandler(host=args.host, port=args.port, db=args.db)

    # The close used to sit in a ``finally`` inside the loop, so a normal
    # return from ``consume`` closed the connection and the next iteration
    # consumed from a closed connection. Close only on the way out.
    try:
        while True:
            try:
                mq_fanout.consume(redis_handler.callback)
            except KeyboardInterrupt:
                logger.warning('publish_comment_redis keyboard interrupt\n')
                break
            except Exception as e:
                logger.error(
                    'publish_comment_redis main exception: {}'.format(e))
                break
    finally:
        mq_fanout.close()
def main():
    """Feed messages from the RabbitMQ fanout into ``MdHandler.callback``.

    ``consume`` is called in a loop. ``KeyboardInterrupt`` and any other
    ``Exception`` are logged and end the loop. The fanout connection is
    closed exactly once, when the loop is left.
    """
    args = _parse_args()
    Config.set_dir(path.join(CURRENT_PATH, 'config.json'))
    mq_fanout = RabbitMqFanout(args.rabbitmq_host, args.rabbitmq_queue)
    md_handler = MdHandler(cluster=args.cluster,
                           database=args.db,
                           collection=args.collection)

    # The close used to sit in a ``finally`` inside the loop, so a normal
    # return from ``consume`` closed the connection and the next iteration
    # consumed from a closed connection. Close only on the way out.
    try:
        while True:
            try:
                mq_fanout.consume(md_handler.callback)
            except KeyboardInterrupt:
                logger.warning('keyboard interrupt\n')
                break
            except Exception as e:
                logger.error('main exception: {}'.format(e))
                break
    finally:
        mq_fanout.close()
def test_api_get():
    """Check that ``Api.get`` returns the expected channel record first."""
    channel_id = 'UC6FcYHEm7SO1jpu5TKjNXEA'
    expected_record = dict([
        ("id", 1),
        ("channelName", "這群人TGOP"),
        ("channelId", "UC6FcYHEm7SO1jpu5TKjNXEA"),
        ("location", "TW"),
        ("category", "Entertainment"),
        ("language", "Chinese_Traditional"),
        ("contact", "*****@*****.**"),
        ("createdAt", "2008-06-07T00:00:00.000Z"),
        ("updateLock", None),
        ("subscriber", None),
    ])

    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)
    portal_host = Config.instance().get('PORTAL_SERVER')
    cache_dir = Config.instance().get('CACHE_DIR')
    api = Api(host=portal_host,
              target_path='Youtube_channels',
              cache_path=cache_dir)

    fetched = api.get(params={'where': {'channelId': channel_id}})

    assert isinstance(fetched, list)
    assert fetched[0] == expected_record
def main():
    """Push videos returned by the YouTube API that are not yet stored.

    The ids of the stored videos are loaded once. For every channel, up to
    50 videos are requested through ``gen_channel_video``; each one for which
    ``video_id_exist`` is false is pushed, and push failures are logged
    without stopping the run. With ``args.dry_run`` set, the function returns
    right after the first channel's videos were requested, before any push.
    """
    args = _parse_args()
    config_file = path.join(CURRENT_PATH, 'config.json')
    Config.set_dir(config_file)

    youtube_api = YoutubeApi(args.youtube_api_key)
    channels_detail = YoutubeChannel(
        host=Config.instance().get('PORTAL_SERVER'),
        cache_path=Config.instance().get('CACHE_DIR'))
    videos = YoutubeVideo(
        host=Config.instance().get('PORTAL_SERVER'),
        cache_path=Config.instance().get('CACHE_DIR'),
        filter_params={"fields": {"videoId": True}})

    known_video_ids = []
    for stored in videos:
        known_video_ids.append(stored['videoId'])
    logger.info('In main loading number of video id: {}'.format(
        len(known_video_ids)))

    for channel in channels_detail:
        logger.info('gen videos by channel id {}'.format(channel['channelId']))
        fetched = youtube_api.gen_channel_video(channel['channelId'],
                                                max_result=50)
        if args.dry_run:
            return

        for video_id, detail in fetched.items():
            if video_id_exist(video_id, known_video_ids):
                logger.debug(
                    "Skip due to videoId '{}' exit".format(video_id))
                continue
            try:
                videos.push(detail)
                logger.info("push data: {}".format(detail))
            except Exception as e:
                logger.error('error at pushing: {}, error message: {}'.format(
                    detail, e))
def test_get_comments_with_sentiment_score():
    """Check that scored comments keep ``text`` and gain ``sentimentScore``."""
    video_id = 'RnAXPLG_di8'
    sample_comment = {
        'commentId': 'comment_id',
        'videoId': video_id,
        'authorChannelId': 'author_channel_id',
        'author': 'author display name',
        'text': 'text_display',
        'likeCount': 0,
        'publishedAt': '2019-06-27T13:49:30Z',
        'updatedAt': '2019-06-27T13:49:30Z',
        'replyCount': 1,
    }
    test_data = defaultdict(list)
    test_data[video_id].append(sample_comment)

    config_file = path.join(os.getcwd(), 'src/config.json')
    Config.set_dir(config_file)
    api_key = Config.instance().get('YOUTUBE_API_KEYS')[0]
    yt_comments = YoutubeComments(api_key)

    scored = dict(yt_comments._get_comments_with_sentiment_score(test_data))

    assert scored.get(video_id) is not None
    first_comment = scored.get(video_id)[0]
    assert first_comment.get('text') is not None
    assert first_comment.get('sentimentScore') is not None
def run_update_comment(channel_id, video_id):
    """Start the configured comment-update command for one video.

    The command line is the ``UPDATE_VIDEO_COMMENT_CMD`` config value followed
    by ``--channel-id`` and ``--video-id``. It is launched through the shell
    with ``subprocess.Popen`` and is not waited for.

    Args:
        channel_id: Value passed as ``--channel-id``.
        video_id: Value passed as ``--video-id``.
    """
    import shlex

    # The command runs with shell=True, so quote both ids to keep shell
    # metacharacters in them from being interpreted. Ids made only of
    # letters, digits, '-' and '_' are left unchanged by shlex.quote.
    quoted_channel = shlex.quote(str(channel_id))
    quoted_video = shlex.quote(str(video_id))

    cmd = Config.instance().get('UPDATE_VIDEO_COMMENT_CMD') + \
        ' --channel-id {} --video-id {}'.format(quoted_channel, quoted_video)
    logger.info('online_updater run : {}'.format(cmd))
    subprocess.Popen(cmd, shell=True)
def _get_commenter_statistic(code):
    """Yield the tab-separated fields of each line of a per-author count query.

    The query counts rows per ``author`` in table ``comment_<code>``. It is
    substituted into the ``HIVE_CMD`` config template and run through
    ``popen``.

    Args:
        code: Suffix of the ``comment_<code>`` table name.

    Yields:
        One list of strings per output line, split on tab characters.
    """
    hive_sql = '"select author, count(author) from comment_{} group by author"'.format(
        code)
    command = Config.instance().get('HIVE_CMD').format(hive_sql)

    # Close the pipe explicitly once the output is read instead of leaving
    # the open handle to be reclaimed by garbage collection.
    with popen(command) as pipe:
        result = pipe.read()

    for line in result.splitlines():
        yield line.split('\t')
def main():
    """Run ``MdCommentLoader.gen`` against the ``comment-chinese`` database."""
    config_file = path.join(CURRENT_DIR, '../config.json')
    Config.set_dir(config_file)

    loader = MdCommentLoader(cluster='raw-comment-chinese',
                             db='comment-chinese')
    loader.gen()
def __init__(self):
    """Load ``SENTIMENT_API_URL`` from Config; ``_sess`` starts as ``None``."""
    sentiment_url = Config.instance().get('SENTIMENT_API_URL')
    self.URL = sentiment_url
    self._sess = None