    def upload_row(self, row):
        item_counter = 0
        # retrieve lists of items to put and update
        put_items, update_items, update_set_items = self.row_processor(row)

        # add items
        for item in put_items:
            try:
                self.table.put_item(Item=item)
                item_counter += 1
            except Exception as e:
                print(e, item[self.pk], item[self.sk])
                with open(get_path_processed(CSV_DDB_ECLIS_FAILED), 'a') as f:
                    f.write(item[self.pk] + '\n')

        # update item attributes
        for item in update_items:
            try:
                pk_val, sk_val, expression_att_names, expression_att_values = self._extract_attributes(item)
                # pair '#name' placeholders with ':value' placeholders in insertion order
                update_expression = 'SET ' + ', '.join(
                    f'{name}={value}'
                    for name, value in zip(expression_att_names, expression_att_values)
                )
                self.table.update_item(
                    Key={self.pk: pk_val, self.sk: sk_val},
                    UpdateExpression=update_expression,
                    ExpressionAttributeNames=expression_att_names,
                    ExpressionAttributeValues=expression_att_values
                )
            except Exception as e:
                print(e, item[self.pk], item[self.sk])
                with open(get_path_processed(CSV_DDB_ECLIS_FAILED), 'a') as f:
                    f.write(item[self.pk] + '\n')

        # update item set attributes
        for item in update_set_items:
            try:
                pk_val, sk_val, expression_att_names, expression_att_values = self._extract_attributes(item)
                # each set item carries a single set-valued attribute, so take the first entry
                attr_name = next(iter(expression_att_names))
                value_key, value = next(iter(expression_att_values.items()))
                update_expression = f'ADD {attr_name} {value_key}'
                self.table.update_item(
                    Key={self.pk: pk_val, self.sk: sk_val},
                    UpdateExpression=update_expression,
                    ExpressionAttributeNames=expression_att_names,
                    ExpressionAttributeValues={value_key: value}
                )
            except Exception as e:
                print(e, item[self.pk], item[self.sk])
                with open(get_path_processed(CSV_DDB_ECLIS_FAILED), 'a') as f:
                    f.write(item[self.pk] + '\n')

        return item_counter
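For reference, a minimal sketch of the placeholder convention the SET branch above assumes; `_extract_attributes` is not shown here, so the dict contents below are hypothetical:

# Hypothetical output of self._extract_attributes(item) for two plain attributes;
# DynamoDB expression placeholders: names start with '#', values with ':'
expression_att_names = {'#source': 'source', '#date': 'date_decision'}
expression_att_values = {':source': 'RS', ':date': '2021-01-01'}
# zipping the two dicts in insertion order yields the expression used above:
# 'SET #source=:source, #date=:date'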
Example #2
    def upload_row(self, row):
        item_counter = 0

        put_items, update_items, update_set_items = self.row_processor(row)

        for item in put_items:
            try:
                response = self.client.es.index(
                    index=self.client.index_name,
                    doc_type='_doc',
                    id=item[ECLI],
                    body=item,
                    filter_path='_shards.failed'
                )
                if response['_shards']['failed'] != 0:
                    raise Exception('Shard indexing failed.')
                # only count the item once the shards report success
                item_counter += 1
            except Exception as e:
                print(e, item[ECLI])
                with open(get_path_processed(CSV_OS_ECLIS_FAILED), 'a') as f:
                    f.write(item[ECLI] + '\n')

        # add/overwrite document attributes; if the document does not exist yet, create it via upsert
        for item in update_items:
            item_id = item[ECLI]
            item.pop(ECLI)
            try:
                response = self.client.es.update(
                    index=self.client.index_name,
                    doc_type='_doc',
                    id=item_id,
                    body={
                        'doc': item,
                        'upsert': {
                            ECLI: item_id,
                            **item
                        }
                    },
                    filter_path='_shards.failed'
                )
                if response['_shards']['failed'] != 0:
                    raise Exception('Shard indexing failed.')
            except Exception as e:
                print(e, item_id)
                with open(get_path_processed(CSV_OS_ECLIS_FAILED), 'a') as f:
                    f.write(item_id + '\n')

        # add a member to a set attribute of the document; if the document does not exist yet, create it via upsert
        for item in update_set_items:
            item_id = item[ECLI]
            item.pop(ECLI)
            for attribute in item.keys():
                try:
                    response = self.client.es.update(
                        index=self.client.index_name,
                        doc_type='_doc',
                        id=item_id,
                        body={
                            'script': {
                                'source': f'if (ctx._source.{attribute} != null) '
                                          f'{{if (ctx._source.{attribute}.contains(params.{attribute})) '
                                          f'{{ctx.op = "none"}} else {{ctx._source.{attribute}.add(params.{attribute})}}}}'
                                          f'else {{ctx._source.{attribute} = [params.{attribute}]}}',
                                'params': {
                                    attribute: item[attribute]
                                }
                            },
                            'upsert': {
                                ECLI: item_id,
                                attribute: [item[attribute]]
                            }
                        },
                        filter_path='_shards.failed'
                    )
                    if response['_shards']['failed'] != 0:
                        raise Exception('Shard indexing failed.')
                except Exception as e:
                    print(e, item_id, item)
                    with open(get_path_processed(CSV_OS_ECLIS_FAILED), 'a') as f:
                        f.write(item_id + '\n')

        return item_counter
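To make the scripted set update above concrete, here is a hypothetical rendered request body for the attribute 'cites'; the literal key 'ecli' and the ECLI values are illustrative assumptions:

# Hypothetical body passed to es.update() for attribute == 'cites':
body = {
    'script': {
        # no-op if the value is already present, append if the list exists,
        # otherwise create a one-element list
        'source': 'if (ctx._source.cites != null) '
                  '{if (ctx._source.cites.contains(params.cites)) '
                  '{ctx.op = "none"} else {ctx._source.cites.add(params.cites)}}'
                  'else {ctx._source.cites = [params.cites]}',
        'params': {'cites': 'ECLI:NL:HR:2020:123'}
    },
    # used only when the document does not exist yet
    'upsert': {'ecli': 'ECLI:NL:HR:2019:456', 'cites': ['ECLI:NL:HR:2020:123']}
}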
Example #3
    def _get_row_processor(self):
        def row_processor_rs_cases(row):
            update_items = []
            update_set_items = []
            # transform set attributes to lists
            # (domains, predecessor_successor_cases, references_legislation, alternative_sources)
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT, RS_HASVERSION]:
                if attribute in row:
                    row[attribute] = row[attribute].split(SET_SEP)
            put_items = [row]
            return put_items, update_items, update_set_items

        def row_processor_rs_opinions(row):
            put_items, update_items, update_set_items = row_processor_rs_cases(row)
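            # register this opinion on its corresponding decision document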
            if ECLI_DECISION in row:
                update_items.append({
                    ECLI: row[ECLI_DECISION],
                    ECLI_OPINION: row[ECLI]
                })
            return put_items, update_items, update_set_items

        def row_processor_li_cases(row):
            put_items = []
            update_set_items = []
            if LI_LAW_AREA in row:
                row[LI_LAW_AREA] = row[LI_LAW_AREA].split(SET_SEP)
            row_li = {ECLI: row[ECLI]}
            # exclude the ECLI key itself (set difference needs a set, not a string)
            for key in row.keys() - {ECLI}:
                row_li[key + LI] = row[key]
            update_items = [row_li]
            return put_items, update_items, update_set_items

        # @TODO: replace attribute names with global definition
        def row_processor_c_citations(row):
            put_items = []
            update_items = []
            update_set_items = []
            if row['keep1'] == 'True':
                update_set_items = [{
                    ECLI: row[ECLI],
                    'cites': row[LIDO_JURISPRUDENTIE]
                }, {
                    ECLI: row[LIDO_JURISPRUDENTIE],
                    'cited_by': row[ECLI]
                }]
            return put_items, update_items, update_set_items

        def row_processor_l_citations(row):
            put_items = []
            update_items = []
            update_set_items = [{
                ECLI: row[ECLI],
                'legal_provisions': row[LIDO_ARTIKEL_TITLE]
            }]
            return put_items, update_items, update_set_items

        processor_map = {
            get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
            get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
            get_path_processed(CSV_LI_CASES): row_processor_li_cases,
            get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
            get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
        }
        return processor_map.get(self.path)
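As a rough illustration of the flat processors above (the separator and column names are assumptions; the constants are defined elsewhere in the repo):

# Hypothetical walk-through, assuming SET_SEP == '; ', ECLI == 'ecli'
# and RS_SUBJECT == 'subject'. For a CSV_RS_CASES row:
row = {'ecli': 'ECLI:NL:HR:2020:123', 'subject': 'Civiel recht; Strafrecht'}
# row_processor_rs_cases(row) would return:
#   put_items        == [{'ecli': 'ECLI:NL:HR:2020:123',
#                         'subject': ['Civiel recht', 'Strafrecht']}]
#   update_items     == []
#   update_set_items == []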
Example #4
    def _get_row_processor(self):
        def row_processor_rs_cases(row):
            """
            turns csv row (1 RS case) into item(s) for DynamoDB table according to this schema
            :param row: dict representation of csv row with RS case attributes
            :return: list of dict representation of items in schema format
            """
            put_items = []
            update_set_items = []
            # split set attributes (domain, case citations, legislation citations)
            if RS_SUBJECT in row:
                for val in row[RS_SUBJECT].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                        RS_SUBJECT[:-1]: val
                    })
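            # move set-valued attributes into separate update-set items (stored as DynamoDB sets)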
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
                if attribute in row:
                    update_set_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DATA.value,
                        attribute: set(row[attribute].split(SET_SEP))
                    })
                    row.pop(attribute)
            put_items.append({
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                **row
            })
            return put_items, [], update_set_items

        def row_processor_rs_opinions(row):
            put_items = []
            update_items = []
            update_set_items = []
            if RS_SUBJECT in row:
                for val in row[RS_SUBJECT].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
                        RS_SUBJECT[:-1]: val
                    })
            # split set attributes (domain, case citations, legislation citations)
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
                if attribute in row:
                    update_set_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DATA.value,
                        attribute: set(row[attribute].split(SET_SEP))
                    })
                    row.pop(attribute)
            put_items.append({
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
                **row
            })
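            # register this opinion on its corresponding decision's DATA item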
            if ECLI_DECISION in row:
                update_items.append({
                    self.pk: row[ECLI_DECISION],
                    self.sk: ItemType.DATA.value,
                    ECLI_OPINION: row[ECLI]
                })
            return put_items, update_items, update_set_items

        def row_processor_li_cases(row):
            put_items = []
            update_items = []
            update_set_items = []
            row_li = dict()
            # exclude the ECLI key itself (set difference needs a set, not a string)
            for key in row.keys() - {ECLI}:
                row_li[key + LI] = row[key]
            if LI_LAW_AREA in row:
                for val in row[LI_LAW_AREA].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM_LI.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                        LI_LAW_AREA[:-1] + LI: val
                    })
                update_set_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    LI_LAW_AREA + LI: set(row[LI_LAW_AREA].split(SET_SEP))
                })
                row_li.pop(LI_LAW_AREA + LI)
            update_items.append({
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                **row_li
            })
            return put_items, update_items, update_set_items

        # @TODO: replace attribute names with global definition
        def row_processor_c_citations(row):
            update_set_items = []
            if row['keep1'] == 'True':
                update_set_items = [{
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    'cites': {row[LIDO_JURISPRUDENTIE]}
                }, {
                    self.pk: row[LIDO_JURISPRUDENTIE],
                    self.sk: ItemType.DATA.value,
                    'cited_by': {row[ECLI]}
                }]
            return [], [], update_set_items

        def row_processor_l_citations(row):
            update_set_items = [{
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                'legal_provisions': {row[LIDO_ARTIKEL_TITLE]}
            }]
            return [], [], update_set_items

        processor_map = {
            get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
            get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
            get_path_processed(CSV_LI_CASES): row_processor_li_cases,
            get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
            get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
        }
        return processor_map.get(self.path)
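For orientation, a hypothetical sketch of the two item shapes the schema-aware variant above emits for one RS case; every concrete key name and enum value below is an assumption (pk/sk names, KEY_SEP, key_sdd, the ItemType/DataSource/DocType values):

# Hypothetical, assuming self.pk == 'ecli', self.sk == 'item_type', KEY_SEP == '#',
# key_sdd == 'source_doc_date', ItemType.DOM/DATA == 'DOM'/'DATA',
# DataSource.RS == 'RS', DocType.DEC == 'DEC':
# one DOM item per subject value ...
dom_item = {'ecli': 'ECLI:NL:HR:2020:123', 'item_type': 'DOM#Civiel recht',
            'source_doc_date': 'RS#DEC#2021-01-01', 'subject': 'Civiel recht'}
# ... and one DATA item with the remaining attributes; set-valued attributes
# travel separately as update_set_items holding Python set values
data_item = {'item_type': 'DATA', 'source_doc_date': 'RS#DEC#2021-01-01',
             'ecli': 'ECLI:NL:HR:2020:123'}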
Example #5
input_paths = [
    get_path_raw(CSV_RS_CASES),
    get_path_raw(CSV_RS_OPINIONS),
    get_path_raw(CSV_LI_CASES)
]

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUTS:\t\t\t\t', [basename(input_path) for input_path in input_paths])
print('OUTPUTS:\t\t\t', [
    basename(get_path_processed(basename(input_path)))
    for input_path in input_paths
], '\n')

# run data transformation for each input file
for input_path in input_paths:
    file_name = basename(input_path)
    output_path = get_path_processed(file_name)
    print(f'\n--- PREPARATION {file_name} ---\n')
    storage = Storage(location=args.storage)
    storage.setup_pipeline(output_paths=[output_path], input_path=input_path)
    last_updated = storage.pipeline_last_updated
    print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

    print(f'\n--- START {file_name} ---\n')
Example #6
from csv import DictReader
import csv
from data_loading.row_processors.dynamodb import DynamoDBRowProcessor
from data_loading.row_processors.opensearch import OpenSearchRowProcessor
from data_loading.clients.dynamodb import DynamoDBClient
from data_loading.clients.opensearch import OpenSearchClient
from definitions.storage_handler import Storage, CSV_RS_CASES, CSV_LI_CASES, CSV_RS_OPINIONS, CSV_CASE_CITATIONS, \
    CSV_LEGISLATION_CITATIONS, get_path_processed, get_path_raw
import time
import argparse
import sys

csv.field_size_limit(sys.maxsize)

start = time.time()

input_paths = [
    get_path_processed(CSV_RS_CASES),
    get_path_processed(CSV_RS_OPINIONS),
    get_path_processed(CSV_LI_CASES),
    get_path_raw(CSV_CASE_CITATIONS),
    get_path_raw(CSV_LEGISLATION_CITATIONS)
]

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '-partial',
    '--partial',
    choices=['ddb', 'os'],
    help='load data only to either DynamoDB or OpenSearch, not both')
parser.add_argument('-delete',
                    '--delete',