Example 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        help='S3 Bucket with configuration',
                        required=True)
    parser.add_argument(
        '--creation_date',
        help='If not specified, creation_date will be the day before yesterday'
    )
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="youtube_related_video",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        youtube_related_video = YoutubeRelatedVideo(
            credentials=config['youtube'],
            athena_data=config['aws']['athena-data'],
            s3_admin=config['aws']['s3-admin'],
            s3_data=config['aws']['s3-data'])
        youtube_related_video.collect_related_video(
            region_code=config['parameter']['region_code'],
            creation_date=args.creation_date)
    finally:
        logger.save_to_s3()
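This and the examples that follow share the same skeleton: parse command-line arguments, load a JSON config from an S3 URL, and run the collector inside try/finally so that AthenaLogger flushes its logs to S3 even when the job fails. The snippets assume import argparse (plus shutil, logging, datetime, and collections where used) and the project-level helpers (read_dict_from_s3_url, AthenaLogger, the collector classes) are importable. A hypothetical invocation, with the script name and config path as assumptions:

    python youtube_related_video.py --config s3://internet-scholar-admin/config/config.json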
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        help='S3 Bucket with configuration',
                        required=True)
    parser.add_argument(
        '--creation_date',
        help='If specified, script will validate URLs for given date; '
        'otherwise, it will assume yesterday')
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="url_validator",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        url_validator = URLValidator(s3_admin=config['aws']['s3-admin'],
                                     s3_data=config['aws']['s3-data'],
                                     athena_data=config['aws']['athena-data'])
        url_validator.expand_urls(creation_date=args.creation_date)

        total, used, free = shutil.disk_usage("/")
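        # disk_usage returns byte counts; dividing by 2**30 converts to GiB.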
        logging.info(
            "Disk Usage: total: %.1f GiB - used: %.1f GiB - free: %.1f GiB",
            total / (2**30), used / (2**30), free / (2**30))
    finally:
        logger.save_to_s3()
Example 3
def main():
    logger = AthenaLogger(app_name="youtube_data_export_r",
                          s3_bucket='internet-scholar-admin',
                          athena_db='internet_scholar_admin')
    try:
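        # Export parameters are hard-coded for this run rather than read from CLI arguments.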
        import_data(related_date=date(2019, 10, 20),
                    end_related_date=date(2020, 2, 20),
                    graph_date_difference=0,
                    timespan=60)
    finally:
        logger.save_to_s3()
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='S3 Bucket with configuration', required=True)
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="youtube-video-snippet",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        youtube_video_snippet = YoutubeVideoSnippet(credentials=config['youtube'],
                                                    athena_data=config['aws']['athena-data'],
                                                    s3_admin=config['aws']['s3-admin'],
                                                    s3_data=config['aws']['s3-data'])
        youtube_video_snippet.collect_video_snippets()
        # youtube_video_snippet.collect_complementary_video_snippets()
    finally:
        logger.save_to_s3()
Example 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        help='S3 Bucket with configuration',
                        required=True)
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="youtube-channel-stats",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        youtube_channel_stats = YoutubeChannelStats(
            credentials=config['youtube'],
            athena_data=config['aws']['athena-data'],
            s3_admin=config['aws']['s3-admin'],
            s3_data=config['aws']['s3-data'])
        youtube_channel_stats.collect_channel_stats()
    finally:
        logger.save_to_s3()
Example 6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        help='S3 Bucket with configuration',
                        required=True)
    parser.add_argument('-m',
                        '--method',
                        help='Collection method: twint or tweepy',
                        required=True)
    args = parser.parse_args()

    config = read_dict_from_s3_url(url=args.config)
    logger = AthenaLogger(app_name="twitter-search",
                          s3_bucket=config['aws']['s3-admin'],
                          athena_db=config['aws']['athena-admin'])
    try:
        twitter_search = TwitterSearch(
            credentials=config['twitter'],
            athena_data=config['aws']['athena-data'],
            s3_admin=config['aws']['s3-admin'],
            s3_data=config['aws']['s3-data'])
        twitter_search.collect_ancillary_tweets(
            filter_name=config['parameter']['filter'], method=args.method)
        # twitter_search.update_table_youtube_twitter_addition()
    finally:
        logger.save_to_s3()
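        # Presumably rebuilds the Athena table over the saved log partitions (AthenaLogger helper).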
        logger.recreate_athena_table()
Example 7
def main():
    logger = AthenaLogger(app_name="youtube_analysis",
                          s3_bucket='internet-scholar-admin',
                          athena_db='internet_scholar_admin')
    try:
        min_users = 3
        timespan = 60
        graph_date_difference = 0

        # An earlier run (final_date 2019-10-13, end 2019-10-14) exercised the
        # full pipeline -- create_edges, create_nodes, create_louvain,
        # create_gexf, create_classification_tables -- and is not repeated here.

        # Monthly analysis windows as (final_date, end) pairs.
        periods = [
            (date(2019, 10, 15), date(2019, 10, 31)),
            (date(2019, 11, 1), date(2019, 11, 30)),
            (date(2019, 12, 1), date(2019, 12, 31)),
            (date(2020, 1, 1), date(2020, 1, 31)),
            (date(2020, 2, 1), date(2020, 2, 20)),
        ]
        for final_date, end in periods:
            # create_edges, create_louvain and create_classification_tables
            # were run in previous passes and stay disabled:
            # create_edges(min_users=min_users, timespan=timespan,
            #              final_date=final_date, end=end)
            create_nodes(min_users=min_users,
                         timespan=timespan,
                         final_date=final_date,
                         end=end)
            # create_louvain(min_users=min_users, timespan=timespan,
            #                final_date=final_date, end=end)
            create_gexf(min_users=min_users,
                        timespan=timespan,
                        final_date=final_date,
                        end=end)
            # create_classification_tables(min_users=min_users, timespan=timespan,
            #                              related_date=final_date, end_related_date=end,
            #                              graph_date_difference=graph_date_difference)
    finally:
        logger.save_to_s3()
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b',
                        '--bucket',
                        help='S3 Bucket with data',
                        required=True)
    parser.add_argument('-i',
                        '--identifier',
                        help='File identifier on the cloud',
                        required=True)
    parser.add_argument('-l',
                        '--language',
                        help='Audio language',
                        required=True)
    parser.add_argument('-s',
                        '--speaker',
                        help="Speaker's name",
                        required=True)
    parser.add_argument(
        '-t',
        '--speaker_type',
        help="Speaker's type (interviewee, interviewer, single, both)",
        required=True)
    parser.add_argument('-d',
                        '--performance_date',
                        help="Performance date",
                        required=True)
    parser.add_argument('-r', '--part', help="Part", required=True)
    parser.add_argument('-m', '--timeframe', help="Timeframe", required=True)
    parser.add_argument('-c', '--section', help="Section", required=True)
    parser.add_argument('-p', '--project', help="Project", required=True)
    parser.add_argument('-v',
                        '--service',
                        help="Service (aws, microsoft, google, ibm)",
                        required=True)
    args = parser.parse_args()

    config = read_dict_from_s3(bucket=args.bucket, key='config/config.json')

    logger = AthenaLogger(
        app_name=f"transcribe_{args.service}_{args.project}_{args.speaker}_"
                 f"{args.speaker_type}_{args.performance_date}_{args.part}_"
                 f"{args.timeframe}_{args.section}",
        s3_bucket=args.bucket,
        athena_db=config['aws']['athena'])

    try:
        if args.service == "microsoft":
            from transcribe_microsoft import retrieve_transcript, delete_uploaded_file
        elif args.service == "google":
            from transcribe_google import retrieve_transcript, delete_uploaded_file
        elif args.service == "aws":
            from transcribe_aws import retrieve_transcript, delete_uploaded_file
        elif args.service == "ibm":
            from transcribe_ibm import retrieve_transcript, delete_uploaded_file
        else:
            raise Exception(f"Invalid service: {args.service}")

        try:
            logging.info(f'Retrieving transcript from {args.service}')
            metadata = {
                'started_at': str(datetime.datetime.utcnow()),
                'language': args.language,
                'audio_storage': args.identifier
            }
            transcript = retrieve_transcript(
                identifier=args.identifier,
                language=args.language,
                speaker_type=args.speaker_type,
                service_config=config[args.service])
            metadata['finished_at'] = str(datetime.datetime.utcnow())
            transcript['metadata_internet_scholar'] = metadata

            logging.info(f'Successfully retrieved transcript from {args.service}')
            partitions = OrderedDict()
            partitions['service'] = args.service
            partitions['project'] = args.project
            partitions['speaker'] = args.speaker
            partitions['performance_date'] = args.performance_date
            partitions['part'] = args.part
            partitions['speaker_type'] = args.speaker_type
            partitions['timeframe'] = args.timeframe
            partitions['section'] = args.section
            logging.info('Saving transcript to S3')
            save_data_in_s3(content=transcript,
                            s3_bucket=args.bucket,
                            s3_key='transcript.json',
                            prefix='transcript',
                            partitions=partitions)
        finally:
            delete_uploaded_file(args.identifier, config[args.service])
    finally:
        logger.save_to_s3()
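The if/elif chain above dispatches to one of four service modules that expose the same two functions, retrieve_transcript and delete_uploaded_file. As a sketch only (the module names come from the imports above; the helper name load_transcriber is an assumption), the same dispatch can be written table-driven with importlib:

import importlib

# Service name -> module implementing retrieve_transcript / delete_uploaded_file.
TRANSCRIBE_MODULES = {
    'microsoft': 'transcribe_microsoft',
    'google': 'transcribe_google',
    'aws': 'transcribe_aws',
    'ibm': 'transcribe_ibm',
}

def load_transcriber(service):
    # Fail early on an unknown service, mirroring the raise in the example above.
    if service not in TRANSCRIBE_MODULES:
        raise Exception(f"Invalid service: {service}")
    module = importlib.import_module(TRANSCRIBE_MODULES[service])
    return module.retrieve_transcript, module.delete_uploaded_file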