help='fetch incoming citations instead of outgoing')
args = parser.parse_args()

input_path = get_path_raw(
    CSV_LIDO_ECLIS_FAILED) if args.failed else get_path_raw(CSV_RS_INDEX)
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n'
)
storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)
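# LIDO link type: 'inkomende-links' (incoming citations) or 'uitgaande-links' (outgoing citations)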
citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

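# LIDO endpoint and credentials, read from environment variables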
LIDO_ENDPOINT = os.getenv('LIDO_ENDPOINT')
LIDO_USERNAME = os.getenv('LIDO_USERNAME')
LIDO_PASSWORD = os.getenv('LIDO_PASSWORD')

case_citations_fieldnames = [
    ECLI, LIDO_JURISPRUDENTIE, LIDO_LABEL, LIDO_TYPE, RS_RELATION, 'keep1',
# Example 2
parser = argparse.ArgumentParser()
parser.add_argument('storage',
                    choices=['local', 'aws'],
                    help='location to save output data to')
parser.add_argument('--count',
                    help='number of documents to retrieve',
                    type=int,
                    required=False)
args = parser.parse_args()

# set up locations
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', CSV_ECHR_CASES)

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[CSV_ECHR_CASES])

last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---')
start = time.time()

print("--- Extract ECHR data")
arg_end_id = args.count if args.count else None
df, resultcount = read_echr_metadata(
    end_id=arg_end_id,
    fields=['itemid', 'documentcollectionid2', 'languageisocode'],
    verbose=True)
output_path_index = get_path_raw(CSV_RS_INDEX)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n'
)
storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_cases, output_path_opinions, output_path_index],
    input_path=input_path)
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

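# state used while splitting the input into cases and opinions: per-type counters and the record currently being built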
is_case = False

case_counter = 0
opinion_counter = 0
datarecord = dict()

# Field names used for the output CSV; they correspond to tags in the original data
import argparse
import time
from datetime import datetime
from os import getenv
from os.path import basename

start = time.time()

output_path = DIR_RECHTSPRAAK + '.zip'

# set up storage location
parser = argparse.ArgumentParser()
parser.add_argument('storage',
                    choices=['local', 'aws'],
                    help='location to save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUT:\t\t\t', basename(output_path))
storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

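# SAMPLE_TEST=TRUE switches to a small sample archive instead of the full Rechtspraak archive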
if getenv('SAMPLE_TEST') == 'TRUE':
    rs_url = getenv('URL_RS_ARCHIVE_SAMPLE')
else:
    rs_url = getenv('URL_RS_ARCHIVE')

dateTimeObj = datetime.now()
date = f'{dateTimeObj.year}-{dateTimeObj.month}-{dateTimeObj.day}'
# evaluate input arguments
if args.delete == 'ddb':
    # remove all items from table without deleting table itself
    ddb_client.truncate_table()

elif args.delete == 'os':
    # delete OpenSearch index
    os_client.es.indices.delete(os_client.index_name)

else:
    # process each input csv
    for input_path in input_paths:

        # prepare storage
        print(f'\n--- PREPARATION {basename(input_path)} ---\n')
        storage = Storage(location='aws')
        storage.fetch_data([input_path])
        last_updated = storage.fetch_last_updated([input_path])
        print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())
        print(f'\n--- START {basename(input_path)} ---\n')
        print(f'Processing {basename(input_path)} ...')

        # initialize row processors and counters
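        # --partial 'os' updates only OpenSearch; --partial 'ddb' updates only DynamoDB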
        if args.partial != 'os':
            ddb_rp = DynamoDBRowProcessor(input_path, ddb_client.table)
        if args.partial != 'ddb':
            os_rp = OpenSearchRowProcessor(input_path, os_client)
        case_counter = 0
        ddb_item_counter = 0
        os_item_counter = 0