Exemple #1
0
    def test_process_non_orgs_calls_generate_composite_key_correctly(
            self, mocked_iso_code, mocked_comp_key, location_table):
        """The composite key mock is called once per row, in row order.

        Each call is expected to receive the row's city together with the
        value produced by the mocked ISO code lookup as keyword arguments.
        """
        iso_values = ['One', 'Two', 'Three']
        mocked_iso_code.side_effect = iso_values
        mocked_comp_key.side_effect = [
            'london_one', 'paris_two', 'new-york_three'
        ]

        # Act first, then compare the recorded calls against expectations.
        process_non_orgs(location_table, set(), ['id'])

        cities = ['London', 'Paris', 'New York']
        expected_calls = [
            mock.call(city=city, country=country)
            for city, country in zip(cities, iso_values)
        ]
        assert mocked_comp_key.mock_calls == expected_calls
Exemple #2
0
    def test_process_non_orgs_inserts_location_comp_key(
            self, mocked_iso_code, mocked_comp_key, location_table):
        """Each output row carries the composite key under 'location_id'."""
        mocked_iso_code.side_effect = ['One', 'Two', 'Three']
        mocked_comp_key.side_effect = [
            'london_one', 'paris_two', 'new-york_three'
        ]

        # (id, city, country, location_id) for every expected output row.
        row_values = [
            ('111', 'London', 'One', 'london_one'),
            ('222', 'Paris', 'Two', 'paris_two'),
            ('333', 'New York', 'Three', 'new-york_three'),
        ]
        expected_result = [
            {
                'id': row_id,
                'city': city,
                'country': country,
                'location_id': location_id,
            }
            for row_id, city, country, location_id in row_values
        ]

        actual = process_non_orgs(location_table, set(), ['id'])
        assert actual == expected_result
Exemple #3
0
    def test_process_non_orgs_drops_existing_rows_with_multiple_primary_keys(
            self, valid_table):
        """Rows whose (id, other) pair is already known are left out."""
        already_present = {('111', 'cat'), ('222', 'dog')}
        primary_keys = ['id', 'other']

        remaining = process_non_orgs(valid_table, already_present,
                                     primary_keys)

        # Only the row not listed in `already_present` should survive.
        assert remaining == [{'id': '333', 'other': 'frog'}]
Exemple #4
0
    def test_process_non_orgs_changes_country_code_column_name(
            self, mocked_iso_code, mocked_comp_key, location_table):
        """Output rows expose 'country' and no longer expose 'country_code'."""
        mocked_iso_code.side_effect = ['One', 'Two', 'Three']
        # Composite key generation fails for every row in this scenario.
        mocked_comp_key.side_effect = ValueError

        first_row = process_non_orgs(location_table, set(), ['id'])[0]
        keys = set(first_row)

        assert 'country_code' not in keys
        assert 'country' in keys
Exemple #5
0
    def test_process_non_orgs_renames_uuid_columns(self, valid_table):
        """Processed rows from the valid_table fixture are keyed by 'id'."""
        pairs = [('111', 'cat'), ('222', 'dog'), ('333', 'frog')]
        expected_result = [
            {'id': row_id, 'other': other} for row_id, other in pairs
        ]

        actual = process_non_orgs(valid_table, set(), ['id'])
        assert actual == expected_result
Exemple #6
0
    def test_process_non_orgs_changes_nans_to_none(self):
        """Both NaN and None inputs come back as None in the output rows.

        The NaN is built with ``float('nan')`` rather than ``pd.np.nan``: the
        ``pandas.np`` alias was deprecated in pandas 1.0 and removed in 2.0,
        and ``numpy.nan`` is an ordinary Python float NaN anyway.
        """
        df = pd.DataFrame({
            'uuid': ['111', '222', '333'],
            'other': [float('nan'), 'dog', None]
        })

        expected_result = [{
            'id': '111',
            'other': None
        }, {
            'id': '222',
            'other': 'dog'
        }, {
            'id': '333',
            'other': None
        }]

        assert process_non_orgs(df, set(), ['id']) == expected_result
Exemple #7
0
def run():
    """Load one Crunchbase table from the tar file and insert new rows.

    Configuration is read from ``BATCHPAR_*`` environment variables:
    ``test`` (a Python literal; when truthy only 1000 rows are read),
    ``db_name``, ``table``, ``batch_size`` and ``outinfo`` (an S3 path).

    Rows are processed with ``process_non_orgs`` against the primary keys
    already present in the database, inserted batch by batch, and finally
    an empty object is written to the S3 path to mark the task as done.
    """
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        # Insert only the current batch; passing the full `processed_rows`
        # here would re-insert every row once per batch.
        insert_data("BATCHPAR_config",
                    'mysqldb',
                    db_name,
                    Base,
                    table_class,
                    batch,
                    low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.warning("Batch job complete.")
Exemple #8
0
    def test_process_non_orgs_converts_boolean_columns(self,
                                                       mocked_bool_convert):
        """Values in 'is_cool' are replaced by the bool converter's output."""
        converted = [True, False, None]
        mocked_bool_convert.side_effect = converted

        ids = ['111', '222', '333']
        df = pd.DataFrame({
            'uuid': ids,
            'is_cool': ['t', 'f', 'bar']
        })

        # One output row per input row, with the mocked conversion applied.
        expected_result = [
            {'id': row_id, 'is_cool': flag}
            for row_id, flag in zip(ids, converted)
        ]

        actual = process_non_orgs(df, set(), ['id'])
        assert actual == expected_result
Exemple #9
0
    def test_process_non_orgs_inserts_none_when_location_id_fails(
            self, mocked_iso_code, mocked_comp_key, location_table):
        """A ValueError from the composite key mock yields location_id None."""
        mocked_iso_code.side_effect = ['One', 'Two', 'Three']
        mocked_comp_key.side_effect = ValueError

        # (id, city, country) for every expected output row.
        row_values = [
            ('111', 'London', 'One'),
            ('222', 'Paris', 'Two'),
            ('333', 'New York', 'Three'),
        ]
        expected_result = [
            {
                'id': row_id,
                'city': city,
                'country': country,
                'location_id': None,
            }
            for row_id, city, country in row_values
        ]

        actual = process_non_orgs(location_table, set(), ['id'])
        assert actual == expected_result