Example #1
    row_date = row[DATE_COLUMN]

    fix_date = [it['to'] for it in FIX_BAD_DATE if it['from'] == row_date]
    if len(fix_date) > 0:
        row_date = fix_date[0]

    try:
        row[DATE_COLUMN] = convert_to_uniform_date(row_date)
    except ValueError as e:
        # Necessary because the spreadsheet contains some invalid dates.
        logging.warning(
            'Failed to convert %s to a uniform date; removing it. Full error: %s',
            row_date, e
        )
        del row[DATE_COLUMN]

    row[SOURCE_COLUMN] = 'the-marker'
    row[PROOF_COLUMN] = 'https://www.themarker.com/career/EXT-1.2577328'
    return row


def convert_to_uniform_date(raw_date):
    # Try each known format in turn; raise if none of them matches.
    for fmt in DATE_FORMATS:
        try:
            return datetime.datetime.strptime(raw_date, fmt).date()
        except ValueError:
            pass
    raise ValueError('Failed to convert date using all available date formats')

process(process_row=process_row, modify_datapackage=modify_datapackage)
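The excerpt above appears to start partway through process_row and references module-level names that are not shown (the modify_datapackage passed on the last line is likewise cut off). A minimal sketch of what the missing imports and constants might look like; every value below is a hypothetical placeholder, not the file's actual configuration:

import datetime
import logging

from datapackage_pipelines.wrapper import process

DATE_COLUMN = 'date'        # hypothetical column names
SOURCE_COLUMN = 'source'
PROOF_COLUMN = 'proof_url'

# Manual corrections for known-bad dates in the source spreadsheet (placeholder entry).
FIX_BAD_DATE = [
    {'from': '31/02/2016', 'to': '28/02/2016'},
]

# Formats tried, in order, by convert_to_uniform_date() (placeholder list).
DATE_FORMATS = ['%d/%m/%Y', '%d.%m.%Y', '%Y-%m-%d']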
Example #2
                                        'has_article_46', {}).get(
                                            'activities', [])
                            ]
                        }
                    },
                ]
            },
        ]
        row['others'] = [x for x in all_districts if x != district]

    return row


def modify_datapackage(dp, *_):
    dp['resources'][0]['schema']['fields'].extend([{
        'name': 'charts',
        'type': 'array',
        'es:itemType': 'object',
        'es:index': False
    }, {
        'name': 'others',
        'type': 'array',
        'es:index': False,
        'es:itemType': 'string',
    }])
    return dp


if __name__ == '__main__':
    process(modify_datapackage=modify_datapackage, process_row=process_row)
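The es:-prefixed keys are hints for an Elasticsearch dumper later in the pipeline (assuming the datapackage-pipelines Elasticsearch tooling): es:itemType gives the element type of an array field and es:index: False keeps the field out of the search index. A quick check of what modify_datapackage produces, with a minimal stand-in datapackage:

dp = {'resources': [{'schema': {'fields': [{'name': 'district', 'type': 'string'}]}}]}
modify_datapackage(dp)
print([f['name'] for f in dp['resources'][0]['schema']['fields']])
# ['district', 'charts', 'others']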
Example #3
import os
import json

from datapackage_pipelines.wrapper import process


def modify_datapackage(dp, parameters, _):
    os.makedirs(parameters['out-path'], exist_ok=True)
    if dp:
        filename = os.path.join(parameters['out-path'], 'datapackage.json')
        with open(filename + '.tmp', 'w') as tmp:
            json.dump(dp, tmp)
        os.rename(filename + '.tmp', filename)
    return dp


if __name__ == '__main__':
    process(modify_datapackage=modify_datapackage)
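This processor dumps the datapackage descriptor to disk by writing a .tmp file and renaming it, so a concurrent reader never sees a half-written datapackage.json. A standalone sketch of the same write-then-rename pattern, using a throwaway directory (paths and data here are illustrative only):

import json
import os
import tempfile

out_path = tempfile.mkdtemp()                 # stand-in for parameters['out-path']
dp = {'name': 'example', 'resources': []}     # stand-in datapackage descriptor

filename = os.path.join(out_path, 'datapackage.json')
with open(filename + '.tmp', 'w') as tmp:
    json.dump(dp, tmp)
os.rename(filename + '.tmp', filename)        # atomic replace on the same filesystem

with open(filename) as f:
    print(f.read())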
Example #4
                    return

        try:
            for k, v in row.items():
                if k in ['sensitive_order']:
                    row[k] = boolean(v)
                elif k in ['budget_code']:
                    row[k] = budget_code(v)
                elif k in ['end_date', 'order_date', 'start_date']:
                    row[k] = date(v)
                elif k in ['volume', 'executed']:
                    row[k] = Decimal(v.replace(',', '') if v is not None and v != '' else 0)
                elif isinstance(v, str):
                    row[k] = v.strip()
                assert row['order_id']
            stats['good-lines'] += 1
        except Exception as e:
            stats['bad-lines'] += 1
            logging.exception('ERROR in row %d: %r', row_index, row)
            bad_rows[row['report-url']] += 1
            return
    elif resource_index == 1: # the errors
        row['report-rows'] = total_rows.get(row['report-url'])
        row['report-bad-rows'] = bad_rows.get(row['report-url'])

    return row

process(process_row=process_row)
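This fragment is the tail of a process_row that normalises order rows; the boolean, budget_code and date helpers it calls, along with the stats and bad-row counters, are defined in the part of the file that is not shown. Plausible sketches of the three converters, assuming the conventions suggested by the field names; the real helpers may well differ:

import datetime
from decimal import Decimal   # also used by the fragment above

def boolean(value):
    # Hypothetical: map common truthy spellings to True, everything else to False.
    return str(value).strip().lower() in ('true', 'yes', '1')

def budget_code(value):
    # Hypothetical: keep only the digits of a budget code.
    return ''.join(ch for ch in str(value) if ch.isdigit())

def date(value):
    # Hypothetical: parse a day-first date string into a datetime.date.
    return datetime.datetime.strptime(value.strip(), '%d/%m/%Y').date()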
Example #5
import os
import shutil
import logging

from datapackage_pipelines.wrapper import process


def cleanup(dp, parameters, *_):
    dirs_to_clean = parameters['dirs_to_clean']
    for dir_name in dirs_to_clean:
        abs_path = os.path.abspath(dir_name)
        logging.info('Cleaning artifact: {}'.format(abs_path))
        try:
            shutil.rmtree(abs_path)
        except FileNotFoundError:
            logging.warning('No artifact to clean: {}'.format(abs_path))

    return dp


if __name__ == '__main__':
    process(modify_datapackage=cleanup)
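If this file were saved as, say, clean_artifacts.py (a made-up name), the step could be exercised outside a pipeline to check the parameter shape; dirs_to_clean is the only parameter it reads, and a missing directory only produces a warning:

import tempfile

from clean_artifacts import cleanup   # hypothetical module name for the file above

scratch = tempfile.mkdtemp()
cleanup({}, {'dirs_to_clean': [scratch, 'no-such-dir']})   # second entry only logs a warning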