def main(argv):
    config = config_utils.get_config(argv[0])
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    logging.info("Creating ad screener elasticsearch pages template.")
    create_pages_template(es_cluster_name)
    logging.info("Creating ad screener elasticsearch ad creatives template.")
    create_ads_template(es_cluster_name)

def refresh_frame_list(argv):
    if len(argv) == 1:
        configfile = input('Please enter the path to the configuration file: ')
    else:
        configfile = argv[1]
    config = config_utils.get_config(configfile)

    frames_dict = {}

    # Index the frames already present in the download directory.
    file_list = glob.glob(path.join(config['data_download_dir'], '*.fits'))
    for file_path in file_list:
        header = retrieve_image_header(file_path)
        f = Frame(header=header)
        frames_dict[path.basename(file_path)] = f

    # Add frames found under each reduction directory's data/ subdirectory,
    # skipping any that were already indexed above.
    dir_list = glob.glob(path.join(config['data_reduction_dir'], '*'))
    for dir_path in dir_list:
        if path.isdir(dir_path) and path.isdir(path.join(dir_path, 'data')):
            file_list = glob.glob(path.join(dir_path, 'data', '*.fits'))
            for file_path in file_list:
                header = retrieve_image_header(file_path)
                f = Frame(header=header)
                f.filename = path.basename(file_path).replace('.fz', '')
                if path.basename(file_path) not in frames_dict:
                    frames_dict[path.basename(file_path)] = f

    args = {'frame_list': 'updated_frame_list.txt'}
    output_frame_list(args, frames_dict, log=None)
    print('Updated frame list output to ./updated_frame_list.txt')

def main(argv):
    config = config_utils.get_config(argv[0])
    db_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    pages_index_name = config['ELASTIC_SEARCH']['PAGES_INDEX_NAME']
    ad_creatives_index_name = config['ELASTIC_SEARCH']['AD_CREATIVES_INDEX_NAME']
    move_pages_to_es(db_connection_params, es_cluster_name, pages_index_name)
    move_ads_to_es(db_connection_params, es_cluster_name, ad_creatives_index_name)

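# Hypothetical invocation sketch for the main(argv) entry points above: both
# expect argv[0] to be the configuration file path rather than the script
# name, so a caller would strip the script name from sys.argv. Everything in
# this block is an illustrative assumption, not part of the original module.
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        exit(f'Usage: python3 {sys.argv[0]} config_file.cfg')
    main(sys.argv[1:])
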
def search_archive_for_data(CONFIG_FILE):
    config = config_utils.get_config(CONFIG_FILE)
    log = log_utils.start_day_log(config, 'data_download')
    downloaded_frames = read_frame_list(config, log)
    (start_time, end_time) = set_date_range(config, log)
    new_frames = fetch_new_frames(config, start_time, end_time, log)
    downloaded_frames = download_new_frames(config, new_frames, downloaded_frames, log)
    framelist_utils.output_frame_list(config, downloaded_frames, log)
    log_utils.close_log(log)

def prepare_data_for_reduction(CONFIG_FILE):
    config = config_utils.get_config(CONFIG_FILE)
    log = log_utils.start_day_log(config, 'data_preparation')
    compressed_frames = check_for_new_frames(config, log)
    if len(compressed_frames) > 0:
        decompressed_frames = decompress_new_frames(config, log, compressed_frames)
        #transform_frames(decompressed_frames, log)
        sort_data.sort_data(config['data_download_dir'],
                            config['separate_instruments'], log=log)
        datasets = get_dataset_list(config, log)
        for dataset_dir in datasets:
            transfer_to_reduction_directory(config, log, dataset_dir)
    log_utils.close_log(log)

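# A minimal, hypothetical driver chaining the two stage functions above. It is
# not part of the original pipeline; it only relies on the signatures defined
# here, where each stage takes the path to a configuration file.
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        exit(f'Usage: python3 {sys.argv[0]} pipeline_config.cfg')
    config_file = sys.argv[1]
    search_archive_for_data(config_file)       # download any new frames
    prepare_data_for_reduction(config_file)    # decompress, sort and stage them
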
    except Exception as e:
        completion_status = f'Uncaught exception: {e}'
        logging.error(completion_status, exc_info=True)
    finally:
        end_time = datetime.datetime.now()
        num_ads_added = search_runner.num_ads_added_to_db()
        num_impressions_added = search_runner.num_impressions_added_to_db()
        if not min_expected_ads_or_impressions_met(
                num_ads_added, min_expected_new_ads, num_impressions_added,
                min_expected_new_impressions):
            # Log to the error channel because the expected number of new ads
            # or impressions was not met.
            slack_url_for_completion_msg = slack_url_error_channel
        logging.info(search_runner.get_formatted_graph_error_counts())
        send_completion_slack_notification(
            slack_url_for_completion_msg, country_code_uppercase,
            completion_status, start_time, end_time, num_ads_added,
            num_impressions_added, min_expected_new_ads,
            min_expected_new_impressions,
            search_runner.get_formatted_graph_error_counts())


if __name__ == '__main__':
    if len(sys.argv) < 2:
        exit(f"Usage: python3 {sys.argv[0]} generic_fb_collector.cfg")
    config = config_utils.get_config(sys.argv[1])
    country_code = config['SEARCH']['COUNTRY_CODE'].lower()
    config_utils.configure_logger(f"{country_code}_fb_api_collection.log")
    main(config)

def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the CrowdTangle fetch pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        dest='config_path',
                        required=True,
                        help='Configuration file path')
    parser.add_argument(
        '--dry_run',
        dest='dry_run',
        action='store_true',
        default=False,
        required=False,
        help='If true, do not write output to database, and print output instead')
    known_args, pipeline_args = parser.parse_known_args(argv)
    if known_args.dry_run:
        logging.info(
            'DRY RUN, will not write output to database, and will print output to stdout.')

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    config = config_utils.get_config(known_args.config_path)
    database_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    fetch_args_list = get_dashboards_fetch_args(config, database_connection_params)
    logging.info('About to start crowdtangle fetch pipeline with args: %s',
                 fetch_args_list)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        results, errors = (
            pipeline
            | beam.Create(fetch_args_list)
            | 'Fetch CrowdTangle results' >> fetch_crowdtangle.FetchCrowdTangle())

        processed_results = (
            results
            | 'Transform CrowdTangle for SQL' >> beam.ParDo(
                process_crowdtangle_posts.ProcessCrowdTanglePosts())
            | 'Batch CrowdTangle results transformed for SQL' >>
            beam.transforms.util.BatchElements(min_batch_size=10, max_batch_size=500))

        if known_args.dry_run:
            def print_row(row):
                print(row)
                return row

            processed_results | beam.Map(print_row)
        else:
            (processed_results
             | 'Write processed results to Database' >> beam.ParDo(
                 write_crowdtangle_results_to_database.WriteCrowdTangleResultsToDatabase(
                     database_connection_params)))

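# A minimal, hypothetical local invocation of run() in dry-run mode. The flag
# names match the argparse arguments defined above; the config path is a
# placeholder, and any extra arguments (e.g. '--runner=DirectRunner') would be
# passed through to Beam's PipelineOptions.
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run(argv=['--config_path', 'crowdtangle_fetch.cfg', '--dry_run'])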