Example #1
def main(argv):
    config = config_utils.get_config(argv[0])
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    logging.info("Creating ad screener elasticsearch pages template.")
    create_pages_template(es_cluster_name)
    logging.info("Creating ad screener elasticsearch ad creatives template.")
    create_ads_template(es_cluster_name)
Example #2
def refresh_frame_list():

    # Read the configuration file path from the command line, prompting if absent.
    if len(sys.argv) == 1:
        configfile = input('Please enter the path to the configuration file: ')
    else:
        configfile = sys.argv[1]

    config = config_utils.get_config(configfile)

    frames_dict = {}

    file_list = glob.glob(path.join(config['data_download_dir'], '*.fits'))
    for file_path in file_list:
        header = retrieve_image_header(file_path)
        f = Frame(header=header)
        frames_dict[path.basename(file_path)] = f

    # Also register frames already staged in the per-dataset reduction directories,
    # skipping any that were picked up from the download directory above.
    dir_list = glob.glob(path.join(config['data_reduction_dir'], '*'))
    for dir_path in dir_list:
        if path.isdir(dir_path) and path.isdir(path.join(dir_path, 'data')):
            file_list = glob.glob(path.join(dir_path, 'data', '*.fits'))
            for file_path in file_list:
                header = retrieve_image_header(file_path)
                f = Frame(header=header)
                f.filename = path.basename(file_path).replace('.fz', '')
                if path.basename(file_path) not in frames_dict:
                    frames_dict[path.basename(file_path)] = f

    args = {'frame_list': 'updated_frame_list.txt'}
    output_frame_list(args, frames_dict, log=None)

    print('Updated frame list output to ./updated_frame_list.txt')
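refresh_frame_list touches only two configuration keys. An illustrative mapping that would satisfy it is sketched below; the key names come from the code above, while both paths are placeholders.

# Keys referenced by refresh_frame_list; the paths are placeholders.
example_config = {
    'data_download_dir': '/data/incoming',   # freshly downloaded *.fits frames
    'data_reduction_dir': '/data/reduced',   # one directory per dataset, each with a data/ sub-folder
}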
Example #3
def main(argv):
    config = config_utils.get_config(argv[0])
    db_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    pages_index_name = config['ELASTIC_SEARCH']['PAGES_INDEX_NAME']
    ad_creatives_index_name = config['ELASTIC_SEARCH'][
        'AD_CREATIVES_INDEX_NAME']
    move_pages_to_es(db_connection_params, es_cluster_name, pages_index_name)
    move_ads_to_es(db_connection_params, es_cluster_name,
                   ad_creatives_index_name)
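Both Elasticsearch examples (#1 and #3) index the object returned by config_utils.get_config like a two-level mapping, which matches what Python's configparser provides. The helper below is a minimal sketch under that assumption, not the projects' actual implementation; the INI section and key names are taken from the examples, and every value is a placeholder.

import configparser

# Illustrative INI contents; only the section and key names appear in Examples #1 and #3.
EXAMPLE_INI = """
[ELASTIC_SEARCH]
CLUSTER_NAME = my-es-cluster
PAGES_INDEX_NAME = pages
AD_CREATIVES_INDEX_NAME = ad_creatives
"""

def get_config(config_path):
    # Parse an INI-style configuration file and return the ConfigParser object,
    # which supports the config['SECTION']['KEY'] lookups used above.
    parser = configparser.ConfigParser()
    with open(config_path) as config_file:  # fail fast on a bad path
        parser.read_file(config_file)
    return parser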
Example #4
def search_archive_for_data(CONFIG_FILE):

    config = config_utils.get_config(CONFIG_FILE)

    log = log_utils.start_day_log(config, 'data_download')

    downloaded_frames = read_frame_list(config, log)

    (start_time, end_time) = set_date_range(config, log)

    new_frames = fetch_new_frames(config, start_time, end_time, log)

    downloaded_frames = download_new_frames(config, new_frames,
                                            downloaded_frames, log)

    framelist_utils.output_frame_list(config, downloaded_frames, log)

    log_utils.close_log(log)
Example #5
def prepare_data_for_reduction(CONFIG_FILE):

    config = config_utils.get_config(CONFIG_FILE)

    log = log_utils.start_day_log(config, 'data_preparation')

    compressed_frames = check_for_new_frames(config, log)

    if len(compressed_frames) > 0:

        decompressed_frames = decompress_new_frames(config, log, compressed_frames)

        #transform_frames(decompressed_frames, log)

        sort_data.sort_data(config['data_download_dir'], config['separate_instruments'], log=log)

        datasets = get_dataset_list(config, log)

        for dataset_dir in datasets:
            transfer_to_reduction_directory(config, log, dataset_dir)

    log_utils.close_log(log)
Example #6
    except Exception as e:
        completion_status = f'Uncaught exception: {e}'
        logging.error(completion_status, exc_info=True)
    finally:
        end_time = datetime.datetime.now()
        num_ads_added = search_runner.num_ads_added_to_db()
        num_impressions_added = search_runner.num_impressions_added_to_db()
        if not min_expected_ads_or_impressions_met(
                num_ads_added, min_expected_new_ads, num_impressions_added,
                min_expected_new_impressions):

            # log to error channel because num expected ads or impressions not met
            slack_url_for_completion_msg = slack_url_error_channel
        logging.info(search_runner.get_formatted_graph_error_counts())
        send_completion_slack_notification(
            slack_url_for_completion_msg, country_code_uppercase,
            completion_status, start_time, end_time, num_ads_added,
            num_impressions_added, min_expected_new_ads,
            min_expected_new_impressions,
            search_runner.get_formatted_graph_error_counts())


if __name__ == '__main__':
    # Check the arguments before touching sys.argv[1]; otherwise a missing
    # argument raises IndexError instead of printing the usage message.
    if len(sys.argv) < 2:
        exit(f"Usage: python3 {sys.argv[0]} generic_fb_collector.cfg")
    config = config_utils.get_config(sys.argv[1])
    country_code = config['SEARCH']['COUNTRY_CODE'].lower()

    config_utils.configure_logger(f"{country_code}_fb_api_collection.log")
    main(config)
Example #7
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        dest='config_path',
                        required=True,
                        help='Configuration file path')
    parser.add_argument(
        '--dry_run',
        dest='dry_run',
        action='store_true',
        default=False,
        required=False,
        help=
        'If true, do not write output to database, and print output instead')
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.dry_run:
        logging.info(
            'DRY RUN, will not write output to database, and will print output to stdout.'
        )

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    config = config_utils.get_config(known_args.config_path)
    database_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    fetch_args_list = get_dashboards_fetch_args(config,
                                                database_connection_params)

    logging.info('About to start crowdtangle fetch pipeline with args: %s',
                 fetch_args_list)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        results, errors = (pipeline | beam.Create(fetch_args_list)
                           | 'Fetch CrowdTangle results' >>
                           fetch_crowdtangle.FetchCrowdTangle())

        processed_results = (
            results
            | 'Transform CrowdTangle for SQL' >> beam.ParDo(
                process_crowdtangle_posts.ProcessCrowdTanglePosts())
            | 'Batch CrowdTangle results transformed for SQL' >>
            beam.transforms.util.BatchElements(min_batch_size=10,
                                               max_batch_size=500))

        if known_args.dry_run:

            def print_row(row):
                print(row)
                return row

            processed_results | beam.Map(print_row)

        else:
            (processed_results
             | 'Write processed results to Database' >> beam.ParDo(
                 write_crowdtangle_results_to_database.
                 WriteCrowdTangleResultsToDatabase(database_connection_params))
             )
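The module this example comes from presumably defines its own entry point; the sketch below shows the conventional Apache Beam pattern for launching run(), with a placeholder script name in the comment. Flags other than --config_path and --dry_run (for example --runner) are forwarded to Beam through pipeline_args.

# Hypothetical entry point, following the usual Apache Beam convention.
# Example invocation (script name is a placeholder):
#   python fetch_crowdtangle_runner.py --config_path=crowdtangle.cfg --dry_run --runner=DirectRunner
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()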