input_path = (get_path_raw(CSV_LIDO_ECLIS_FAILED)
              if args.failed else get_path_raw(CSV_RS_INDEX))
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n'
)
storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)
citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

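# LIDO API endpoint and credentials are read from environment variables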
LIDO_ENDPOINT = os.getenv('LIDO_ENDPOINT')
LIDO_USERNAME = os.getenv('LIDO_USERNAME')
LIDO_PASSWORD = os.getenv('LIDO_PASSWORD')

case_citations_fieldnames = [
    ECLI, LIDO_JURISPRUDENTIE, LIDO_LABEL, LIDO_TYPE, RS_RELATION, 'keep1',
    'keep2', RS_DATE
]
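# Illustrative sketch only (not from the original script): rows keyed by these
# fieldnames could be appended to the case-citations CSV, e.g. with
# csv.DictWriter(f, fieldnames=case_citations_fieldnames).writerows(rows),
# where `rows` is a hypothetical list of citation dicts.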
Example #2
parser = argparse.ArgumentParser()
parser.add_argument('storage',
                    choices=['local', 'aws'],
                    help='location to save output data to')
parser.add_argument('--count',
                    help='number of documents to retrieve',
                    type=int,
                    required=False)
args = parser.parse_args()

# set up locations
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', CSV_ECHR_CASES)

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[CSV_ECHR_CASES])

last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---')
start = time.time()

print("--- Extract ECHR data ---")
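# if --count was given, retrieve only that many documents; args.count is None
# when the flag is omitted, so everything is fetched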
arg_end_id = args.count if args.count else None
df, resultcount = read_echr_metadata(
    end_id=arg_end_id,
    fields=['itemid', 'documentcollectionid2', 'languageisocode'],
    verbose=True)

print(df)
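# What happens next is not shown in this excerpt; presumably the metadata frame
# is persisted to the configured output, e.g. (assumption):
# df.to_csv(CSV_ECHR_CASES, index=False)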
start = time.time()

output_path = DIR_RECHTSPRAAK + '.zip'

# set up storage location
parser = argparse.ArgumentParser()
parser.add_argument('storage',
                    choices=['local', 'aws'],
                    help='location to save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', basename(output_path))
storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

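# use the small sample archive when SAMPLE_TEST is enabled, otherwise the full
# Rechtspraak.nl archive URL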
if getenv('SAMPLE_TEST') == 'TRUE':
    rs_url = getenv('URL_RS_ARCHIVE_SAMPLE')
else:
    rs_url = getenv('URL_RS_ARCHIVE')

dateTimeObj = datetime.now()
date = dateTimeObj.strftime('%Y-%m-%d')

print(f"Downloading Rechtspraak.nl dump - {date} - {rs_url} ...")
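# The download call itself is not shown in this excerpt; a minimal sketch using
# urllib.request (an assumption, not necessarily how the original pipeline does it):
# urllib.request.urlretrieve(rs_url, output_path)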
parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n'
)
storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_cases, output_path_opinions, output_path_index],
    input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

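# parsing state: track whether the current record is a case or an opinion,
# count how many of each have been processed, and accumulate the fields of the
# record currently being parsed in datarecord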
is_case = False

case_counter = 0
opinion_counter = 0
datarecord = dict()

# Field names used for the output CSV; they correspond to the tags in the original data
IDENTIFIER = 'ecli'
ISSUED = 'issued'
args = parser.parse_args()

print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUTS:\t\t\t\t', [basename(input_path) for input_path in input_paths])
print('OUTPUTS:\t\t\t', [
    basename(get_path_processed(basename(input_path)))
    for input_path in input_paths
], '\n')

# run data transformation for each input file
for input_path in input_paths:
    file_name = basename(input_path)
    output_path = get_path_processed(file_name)
    print(f'\n--- PREPARATION {file_name} ---\n')
    storage = Storage(location=args.storage)
    storage.setup_pipeline(output_paths=[output_path], input_path=input_path)
    last_updated = storage.pipeline_last_updated
    print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

    print(f'\n--- START {file_name} ---\n')

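    # look up the field mapping and transformation tools configured for this input file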
    field_map = field_maps.get(input_path)
    tool_map = tool_maps.get(input_path)

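    # append to the output CSV and write a header row for the mapped field names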
    with open(output_path, 'a', newline='') as out_file:
        writer = DictWriter(out_file, fieldnames=list(field_map.values()))
        writer.writeheader()

        with open(input_path, 'r', newline='') as in_file:
            reader = DictReader(in_file)
            # process input file by row
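            # The row-by-row processing is not included in this excerpt; a hedged
            # sketch of the typical shape (assumption - field_map renames source
            # columns to output columns):
            # for row in reader:
            #     writer.writerow({field_map[k]: v
            #                      for k, v in row.items() if k in field_map})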
output_path_dir = DIR_RECHTSPRAAK
output_path_index = CSV_OPENDATA_INDEX

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print('OUTPUTS:\t\t\t',
      f'{basename(output_path_dir)}, {basename(output_path_index)}\n')
storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path_dir, output_path_index],
                       input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

# extract all files from the downloaded archive, including its nested year archives:
print('Extracting directories...')
outer_zip = zipfile.ZipFile(input_path)
# for each year-month archive nested in the dataset, create the folder structure
for outer_file in outer_zip.namelist():
    if outer_file.endswith('.zip'):
        year, month = splitext(outer_file)[0][:4], splitext(outer_file)[0][-2:]
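        # skip year archives from before the last pipeline update (presumably
        # already processed in an earlier run)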
        if int(year) < last_updated.year:
            continue
        # create new directory