input_path = get_path_raw(CSV_LIDO_ECLIS_FAILED) if args.failed else get_path_raw(CSV_RS_INDEX)
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n'
)

storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)

# incoming vs. outgoing citation links (LIDO terminology)
citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# LIDO API credentials, configured via environment variables
LIDO_ENDPOINT = os.getenv('LIDO_ENDPOINT')
LIDO_USERNAME = os.getenv('LIDO_USERNAME')
LIDO_PASSWORD = os.getenv('LIDO_PASSWORD')

case_citations_fieldnames = [
    ECLI, LIDO_JURISPRUDENTIE, LIDO_LABEL, LIDO_TYPE, RS_RELATION,
    'keep1', 'keep2', RS_DATE
]
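
# --- Sketch (assumption): the excerpt above only loads the LIDO credentials;
# it does not show how they are used. A minimal example of an authenticated
# request against the LIDO endpoint follows. HTTP basic auth and the 'uri'
# and 'output' parameter names are hypothetical, not taken from the source.
import requests

def fetch_lido_links(ecli_uri):
    response = requests.get(
        LIDO_ENDPOINT,
        params={'uri': ecli_uri, 'output': 'xml'},
        auth=(LIDO_USERNAME, LIDO_PASSWORD))
    response.raise_for_status()
    return response.text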
parser.add_argument('storage', choices=['local', 'aws'],
                    help='location to save output data to')
parser.add_argument('--count', help='number of documents to retrieve',
                    type=int, required=False)
args = parser.parse_args()

# set up locations
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', CSV_ECHR_CASES)

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[CSV_ECHR_CASES])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---')
start = time.time()

print("--- Extract ECHR data")
arg_end_id = args.count if args.count else None
df, resultcount = read_echr_metadata(
    end_id=arg_end_id,
    fields=['itemid', 'documentcollectionid2', 'languageisocode'],
    verbose=True)
print(df)
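
# --- Sketch (assumption): the excerpt ends after printing the metadata frame.
# A minimal example of persisting the result to the configured output path,
# assuming read_echr_metadata returns a pandas DataFrame (not confirmed here).
if df is not None:
    df.to_csv(CSV_ECHR_CASES, index=False)
    print(f'{resultcount} documents written to {CSV_ECHR_CASES}')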
start = time.time()
output_path = DIR_RECHTSPRAAK + '.zip'

# set up storage location
parser = argparse.ArgumentParser()
parser.add_argument('storage', choices=['local', 'aws'],
                    help='location to save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', basename(output_path))

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

if getenv('SAMPLE_TEST') == 'TRUE':
    rs_url = getenv('URL_RS_ARCHIVE_SAMPLE')
else:
    rs_url = getenv('URL_RS_ARCHIVE')

dateTimeObj = datetime.now()
date = f'{dateTimeObj.year}-{dateTimeObj.month}-{dateTimeObj.day}'
print("Downloading Rechtspraak.nl dump - " + date + " - " + rs_url + " ...")
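
# --- Sketch (assumption): one way to fetch the archive announced above is to
# stream it to disk in chunks so the multi-gigabyte dump is never held in
# memory at once. The use of requests here is illustrative, not confirmed.
import requests

response = requests.get(rs_url, stream=True)
response.raise_for_status()
with open(output_path, 'wb') as archive_file:
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        archive_file.write(chunk)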
parser = argparse.ArgumentParser()
parser.add_argument(
    'storage', choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n'
)

storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_cases, output_path_opinions, output_path_index],
    input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

is_case = False
case_counter = 0
opinion_counter = 0
datarecord = dict()

# Field names used for output csv. Field names correspond to tags of original data
IDENTIFIER = 'ecli'
ISSUED = 'issued'
args = parser.parse_args()

print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUTS:\t\t\t\t', [basename(input_path) for input_path in input_paths])
print('OUTPUTS:\t\t\t', [
    basename(get_path_processed(basename(input_path)))
    for input_path in input_paths
], '\n')

# run data transformation for each input file
for input_path in input_paths:
    file_name = basename(input_path)
    output_path = get_path_processed(file_name)
    print(f'\n--- PREPARATION {file_name} ---\n')

    storage = Storage(location=args.storage)
    storage.setup_pipeline(output_paths=[output_path], input_path=input_path)
    last_updated = storage.pipeline_last_updated
    print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

    print(f'\n--- START {file_name} ---\n')

    field_map = field_maps.get(input_path)
    tool_map = tool_maps.get(input_path)

    with open(output_path, 'a', newline='') as out_file:
        writer = DictWriter(out_file, fieldnames=list(field_map.values()))
        writer.writeheader()

        with open(input_path, 'r', newline='') as in_file:
            reader = DictReader(in_file)
            # process input file by row
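            # --- Sketch (assumption): the loop body is not shown in this excerpt.
            # A minimal per-row mapping, assuming field_map renames source columns
            # to output columns and tool_map holds optional per-field callables.
            for row in reader:
                out_row = {}
                for old_name, new_name in field_map.items():
                    value = row.get(old_name, '')
                    tool = tool_map.get(old_name) if tool_map else None
                    out_row[new_name] = tool(value) if tool else value
                writer.writerow(out_row)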
output_path_dir = DIR_RECHTSPRAAK
output_path_index = CSV_OPENDATA_INDEX

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage', choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print('OUTPUTS:\t\t\t',
      f'{basename(output_path_dir)}, {basename(output_path_index)}\n')

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path_dir, output_path_index],
                       input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# extract all files in directory "filename" and all subdirectories:
print('Extracting directories...')
outer_zip = zipfile.ZipFile(input_path)

# for each year directory in dataset create folder structure
for outer_file in outer_zip.namelist():
    if outer_file.endswith('.zip'):
        year, month = splitext(outer_file)[0][:4], splitext(outer_file)[0][-2:]
        if int(year) < last_updated.year:
            continue
        # create new directory