def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True)
    parser.add_argument(
        '--creation_date',
        help='If not specified, creation_date will be the day before yesterday')
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="youtube_related_video",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        youtube_related_video = YoutubeRelatedVideo(credentials=config['youtube'],
                                                    athena_data=config['aws']['athena-data'],
                                                    s3_admin=config['aws']['s3-admin'],
                                                    s3_data=config['aws']['s3-data'])
        youtube_related_video.collect_related_video(region_code=config['parameter']['region_code'],
                                                    creation_date=args.creation_date)
    finally:
        logger.save_to_s3()
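# A sketch of the configuration dictionary these scripts expect read_dict_from_s3_url
# to return. The key names come from the lookups in the functions above and below;
# every value shown here is a placeholder, not a real bucket, database, or credential.
EXAMPLE_CONFIG = {
    'aws': {
        's3-admin': 'example-admin-bucket',    # bucket for logs and admin data
        's3-data': 'example-data-bucket',      # bucket for collected data
        'athena-admin': 'example_admin_db',    # Athena database used by AthenaLogger
        'athena-data': 'example_data_db',      # Athena database for collected data
    },
    'youtube': {},    # YouTube API credentials passed to the YouTube collectors (structure not shown)
    'twitter': {},    # Twitter API credentials passed to TwitterSearch (structure not shown)
    'parameter': {
        'region_code': 'us',          # region for YouTube related-video collection
        'filter': 'example_filter',   # filter name for twitter_search.collect_ancillary_tweets
    },
}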
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True)
    parser.add_argument(
        '--creation_date',
        help='If specified, script will validate URLs for given date; '
             'otherwise, it will assume yesterday')
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="url_validator",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        url_validator = URLValidator(s3_admin=config['aws']['s3-admin'],
                                     s3_data=config['aws']['s3-data'],
                                     athena_data=config['aws']['athena-data'])
        url_validator.expand_urls(creation_date=args.creation_date)

        total, used, free = shutil.disk_usage("/")
        logging.info("Disk usage: total: %.1f GiB - used: %.1f GiB - free: %.1f GiB",
                     total / (2**30), used / (2**30), free / (2**30))
    finally:
        logger.save_to_s3()
def main(): logger = AthenaLogger(app_name="youtube_data_export_r", s3_bucket='internet-scholar-admin', athena_db='internet_scholar_admin') try: import_data(related_date=date(2019, 10, 20), end_related_date=date(2020, 2, 20), graph_date_difference=0, timespan=60) finally: logger.save_to_s3()
def main(): parser = argparse.ArgumentParser() parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True) args = parser.parse_args() config = read_dict_from_s3_url(url=args.config) logger = AthenaLogger(app_name="youtube-video-snippet", s3_bucket=config['aws']['s3-admin'], athena_db=config['aws']['athena-admin']) try: youtube_video_snippet = YoutubeVideoSnippet(credentials=config['youtube'], athena_data=config['aws']['athena-data'], s3_admin=config['aws']['s3-admin'], s3_data=config['aws']['s3-data']) youtube_video_snippet.collect_video_snippets() #youtube_video_snippet.collect_complementary_video_snippets() finally: logger.save_to_s3()
def main(): parser = argparse.ArgumentParser() parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True) args = parser.parse_args() config = read_dict_from_s3_url(url=args.config) logger = AthenaLogger(app_name="youtube-channel-stats", s3_bucket=config['aws']['s3-admin'], athena_db=config['aws']['athena-admin']) try: youtube_channel_stats = YoutubeChannelStats( credentials=config['youtube'], athena_data=config['aws']['athena-data'], s3_admin=config['aws']['s3-admin'], s3_data=config['aws']['s3-data']) youtube_channel_stats.collect_channel_stats() finally: logger.save_to_s3()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True)
    parser.add_argument('-m', '--method', help='twint or tweepy?', required=True)
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="twitter-search",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        twitter_search = TwitterSearch(credentials=config['twitter'],
                                       athena_data=config['aws']['athena-data'],
                                       s3_admin=config['aws']['s3-admin'],
                                       s3_data=config['aws']['s3-data'])
        twitter_search.collect_ancillary_tweets(filter_name=config['parameter']['filter'],
                                                method=args.method)
        # twitter_search.update_table_youtube_twitter_addition()
    finally:
        logger.save_to_s3()
        logger.recreate_athena_table()
def main(): logger = AthenaLogger(app_name="youtube_analysis", s3_bucket='internet-scholar-admin', athena_db='internet_scholar_admin') try: min_users = 3 timespan = 60 graph_date_difference = 0 # final_date = date(2019, 10, 13) # end = date(2019, 10, 14) # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, end_related_date=end, # graph_date_difference=graph_date_difference) final_date = date(2019, 10, 15) end = date(2019, 10, 31) # # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, end_related_date=end, # graph_date_difference=graph_date_difference) # final_date = date(2019, 11, 1) end = date(2019, 11, 30) # # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, end_related_date=end, # graph_date_difference=graph_date_difference) # final_date = date(2019, 12, 1) end = date(2019, 12, 31) # # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, end_related_date=end, # graph_date_difference=graph_date_difference) final_date = date(2020, 1, 1) end = date(2020, 1, 31) # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, end_related_date=end, # graph_date_difference=graph_date_difference) final_date = date(2020, 2, 1) end = date(2020, 2, 20) # create_edges(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_louvain(min_users=min_users, timespan=timespan, final_date=final_date, end=end) create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end) # create_classification_tables(min_users=min_users, timespan=timespan, # related_date=final_date, 
end_related_date=end, # graph_date_difference=graph_date_difference) finally: logger.save_to_s3()
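# A sketch (not part of the original script) of how the repeated monthly blocks in
# main() above could be driven by a single loop. The function name is hypothetical;
# it assumes create_nodes and create_gexf are available in this module and reproduces
# only the calls that are active (uncommented) above.
def create_monthly_graphs():
    min_users = 3
    timespan = 60
    monthly_windows = [
        (date(2019, 10, 15), date(2019, 10, 31)),
        (date(2019, 11, 1), date(2019, 11, 30)),
        (date(2019, 12, 1), date(2019, 12, 31)),
        (date(2020, 1, 1), date(2020, 1, 31)),
        (date(2020, 2, 1), date(2020, 2, 20)),
    ]
    for final_date, end in monthly_windows:
        create_nodes(min_users=min_users, timespan=timespan, final_date=final_date, end=end)
        create_gexf(min_users=min_users, timespan=timespan, final_date=final_date, end=end)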
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bucket', help='S3 Bucket with data', required=True)
    parser.add_argument('-i', '--identifier', help='File identifier on the cloud', required=True)
    parser.add_argument('-l', '--language', help='Audio language', required=True)
    parser.add_argument('-s', '--speaker', help="Speaker's name", required=True)
    parser.add_argument('-t', '--speaker_type',
                        help="Speaker's type (interviewee, interviewer, single, both)",
                        required=True)
    parser.add_argument('-d', '--performance_date', help="Performance date", required=True)
    parser.add_argument('-r', '--part', help="Part", required=True)
    parser.add_argument('-m', '--timeframe', help="Timeframe", required=True)
    parser.add_argument('-c', '--section', help="Section", required=True)
    parser.add_argument('-p', '--project', help="Project", required=True)
    parser.add_argument('-v', '--service', help="Service (aws, microsoft, google, ibm)", required=True)
    args = parser.parse_args()

    config = read_dict_from_s3(bucket=args.bucket, key='config/config.json')
    logger = AthenaLogger(
        app_name=f"transcribe_{args.service}_{args.project}_{args.speaker}_{args.speaker_type}_"
                 f"{args.performance_date}_{args.part}_{args.timeframe}_{args.section}",
        s3_bucket=args.bucket,
        athena_db=config['aws']['athena'])
    try:
        # Select the transcription backend that matches the requested service.
        if args.service == "microsoft":
            from transcribe_microsoft import retrieve_transcript, delete_uploaded_file
        elif args.service == "google":
            from transcribe_google import retrieve_transcript, delete_uploaded_file
        elif args.service == "aws":
            from transcribe_aws import retrieve_transcript, delete_uploaded_file
        elif args.service == "ibm":
            from transcribe_ibm import retrieve_transcript, delete_uploaded_file
        else:
            raise Exception(f"Invalid service: {args.service}")

        try:
            logging.info(f'Retrieve transcript on {args.service}')
            metadata = {
                'started_at': str(datetime.datetime.utcnow()),
                'language': args.language,
                'audio_storage': args.identifier
            }
            transcript = retrieve_transcript(identifier=args.identifier,
                                             language=args.language,
                                             speaker_type=args.speaker_type,
                                             service_config=config[args.service])
            metadata['finished_at'] = str(datetime.datetime.utcnow())
            transcript['metadata_internet_scholar'] = metadata
            logging.info(f'Successfully retrieved transcript on {args.service}')

            # Partition columns that determine where the transcript is stored on S3.
            partitions = OrderedDict()
            partitions['service'] = args.service
            partitions['project'] = args.project
            partitions['speaker'] = args.speaker
            partitions['performance_date'] = args.performance_date
            partitions['part'] = args.part
            partitions['speaker_type'] = args.speaker_type
            partitions['timeframe'] = args.timeframe
            partitions['section'] = args.section

            logging.info('Save transcript on S3')
            save_data_in_s3(content=transcript,
                            s3_bucket=args.bucket,
                            s3_key='transcript.json',
                            prefix='transcript',
                            partitions=partitions)
        finally:
            delete_uploaded_file(args.identifier, config[args.service])
    finally:
        logger.save_to_s3()
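# Illustrative invocation of the transcription driver above. The script file name,
# bucket, identifier, and metadata values are made up for the example:
#
#   python transcribe.py \
#       --bucket example-transcribe-bucket \
#       --identifier interview_part1.wav \
#       --language en-US \
#       --speaker "Jane Doe" \
#       --speaker_type interviewee \
#       --performance_date 2020-02-01 \
#       --part 1 \
#       --timeframe full \
#       --section 1 \
#       --project example_project \
#       --service aws
#
# The chosen --service selects which transcribe_* module is imported, and the remaining
# arguments become the partition columns passed to save_data_in_s3.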