                    help='fetch incoming citations instead of outgoing')
args = parser.parse_args()

# input is the list of previously failed ECLIs when --failed is given, else the full RS index
input_path = get_path_raw(CSV_LIDO_ECLIS_FAILED) if args.failed else get_path_raw(CSV_RS_INDEX)
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n'
)

storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)
citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# LIDO API endpoint and credentials are taken from the environment
LIDO_ENDPOINT = os.getenv('LIDO_ENDPOINT')
LIDO_USERNAME = os.getenv('LIDO_USERNAME')
LIDO_PASSWORD = os.getenv('LIDO_PASSWORD')

case_citations_fieldnames = [ECLI, LIDO_JURISPRUDENTIE, LIDO_LABEL, LIDO_TYPE, RS_RELATION, 'keep1',
parser = argparse.ArgumentParser()
parser.add_argument('storage', choices=['local', 'aws'], help='location to save output data to')
parser.add_argument('--count', help='number of documents to retrieve', type=int, required=False)
args = parser.parse_args()

# set up locations
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', CSV_ECHR_CASES)

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[CSV_ECHR_CASES])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---')
start = time.time()

print("--- Extract ECHR data")
# retrieve at most --count documents (all documents when the flag is omitted)
arg_end_id = args.count if args.count else None
df, resultcount = read_echr_metadata(
    end_id=arg_end_id,
    fields=['itemid', 'documentcollectionid2', 'languageisocode'],
    verbose=True)
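# A minimal sketch of how metadata retrieval such as read_echr_metadata() could page
# through the public HUDOC results endpoint. The endpoint URL, query string, and
# response layout are assumptions, not taken from this project; the actual helper
# may work differently.
import requests
import pandas as pd

def fetch_echr_metadata_sketch(fields, end_id=None, page_size=500):
    base_url = 'https://hudoc.echr.coe.int/app/query/results'  # assumed endpoint
    rows, start = [], 0
    while end_id is None or start < end_id:
        params = {
            'query': 'contentsitename:ECHR',  # assumed query string
            'select': ','.join(fields),
            'start': start,
            'length': page_size,
        }
        resp = requests.get(base_url, params=params, timeout=60)
        resp.raise_for_status()
        batch = [hit['columns'] for hit in resp.json().get('results', [])]  # assumed response layout
        if not batch:
            break
        rows.extend(batch)
        start += page_size
    if end_id is not None:
        rows = rows[:end_id]
    return pd.DataFrame(rows), len(rows)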
output_path_index = get_path_raw(CSV_RS_INDEX)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n'
)

storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_cases, output_path_opinions, output_path_index],
    input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# parsing state: counters for cases and opinions plus the record currently being built
is_case = False
case_counter = 0
opinion_counter = 0
datarecord = dict()

# Field names used for output csv. Field names correspond to tags of original data
import argparse

start = time.time()
output_path = DIR_RECHTSPRAAK + '.zip'

# set up storage location
parser = argparse.ArgumentParser()
parser.add_argument('storage', choices=['local', 'aws'], help='location to save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', basename(output_path))

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# use the small sample archive when SAMPLE_TEST is set, else the full archive
if getenv('SAMPLE_TEST') == 'TRUE':
    rs_url = getenv('URL_RS_ARCHIVE_SAMPLE')
else:
    rs_url = getenv('URL_RS_ARCHIVE')

dateTimeObj = datetime.now()
date = str(dateTimeObj.year) + '-' + str(dateTimeObj.month) + '-' + str(dateTimeObj.day)
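# A minimal sketch of fetching the Rechtspraak archive to output_path with the plain
# requests library, using rs_url from above. The project may use its own download
# helper; the chunk size here is an arbitrary choice.
import requests

response = requests.get(rs_url, stream=True, timeout=60)
response.raise_for_status()
with open(output_path, 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)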
# evaluate input arguments
if args.delete == 'ddb':
    # remove all items from table without deleting table itself
    ddb_client.truncate_table()

elif args.delete == 'os':
    # delete OpenSearch index
    os_client.es.indices.delete(os_client.index_name)

else:
    # process each input csv
    for input_path in input_paths:
        # prepare storage
        print(f'\n--- PREPARATION {basename(input_path)} ---\n')
        storage = Storage(location='aws')
        storage.fetch_data([input_path])
        last_updated = storage.fetch_last_updated([input_path])
        print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

        print(f'\n--- START {basename(input_path)} ---\n')
        print(f'Processing {basename(input_path)} ...')

        # initialize row processors and counters
        if args.partial != 'os':
            ddb_rp = DynamoDBRowProcessor(input_path, ddb_client.table)
        if args.partial != 'ddb':
            os_rp = OpenSearchRowProcessor(input_path, os_client)
        case_counter = 0
        ddb_item_counter = 0
        os_item_counter = 0
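# Sketch of a per-row loop that could follow the setup above, assuming the fetched CSV
# is read with csv.DictReader and that each row processor exposes an upload_row()-style
# method returning the number of items written (hypothetical name, not this project's API).
import csv

with open(input_path, 'r', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        case_counter += 1
        if args.partial != 'os':
            ddb_item_counter += ddb_rp.upload_row(row)  # hypothetical method name
        if args.partial != 'ddb':
            os_item_counter += os_rp.upload_row(row)    # hypothetical method name

print(f'{case_counter} cases processed: '
      f'{ddb_item_counter} DynamoDB items, {os_item_counter} OpenSearch items.')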