Example #1
                                     usecols=mapping['use_cols'])

            if mapping['drop_columns']:
                train_data.drop(mapping['drop_columns'], axis=1, inplace=True)
                gc.collect()

            if mapping['concat_type'] == 'concat':
                print('concatenating...')
                train_data = train_data[mask]
                gc.collect()
                fold_data = train_utils.concat_stupidly(fold_data, train_data)
                assert len(fold_data) == len(train_data)

            elif mapping['concat_type'] == 'merge':
                print('merging...')
                for i in range(2):
                    fold_data = train_utils.map_atom_info(
                        fold_data, train_data, i)

            else:
                raise KeyError(f"unknown concat_type: {mapping['concat_type']}")

            del train_data
            gc.collect()
        assert len(fold_data) == mask.sum()
        print('saving...')
        fold_data.to_csv(f'../../data/oof_tables/{fold}_fold_oof_tables.csv',
                         index=False)
        del fold_data
        gc.collect()
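The loop in Example #1 is driven entirely by a mapping dict built outside the snippet; only the keys it reads ('use_cols', 'drop_columns', 'concat_type') are visible, so the entry below is a hypothetical illustration with placeholder column names.

# Hypothetical mapping entry; only the three keys read by the loop in
# Example #1 are shown, and the column names are placeholders.
mapping = {
    'use_cols': ['molecule_name', 'atom_index_0', 'atom_index_1', 'oof_pred'],
    'drop_columns': [],        # columns to discard right after loading
    'concat_type': 'concat',   # 'concat' -> mask rows then concat_stupidly; 'merge' -> map_atom_info
}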
Example #2
        nrows = None

    if not debug:
        if os.path.isfile(result_filename):
            raise FileExistsError("Result file exists!")

        if os.path.isfile(sub_filename):
            raise FileExistsError("Submission file exists!")

    print("reading data...")
    train = pd.read_csv('../data/train.csv', nrows=nrows)
    test = pd.read_csv('../data/test.csv', nrows=nrows)
    structures = pd.read_csv('../data/structures.csv', nrows=nrows)
    sub = pd.read_csv('../data/sample_submission.csv', nrows=nrows)

    train = map_atom_info(train, structures, 0)
    train = map_atom_info(train, structures, 1)

    test = map_atom_info(test, structures, 0)
    test = map_atom_info(test, structures, 1)

    train_p_0 = train[['x_0', 'y_0', 'z_0']].values
    train_p_1 = train[['x_1', 'y_1', 'z_1']].values
    test_p_0 = test[['x_0', 'y_0', 'z_0']].values
    test_p_1 = test[['x_1', 'y_1', 'z_1']].values

    train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
    test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
    train['dist_x'] = (train['x_0'] - train['x_1'])**2
    test['dist_x'] = (test['x_0'] - test['x_1'])**2
    train['dist_y'] = (train['y_0'] - train['y_1'])**2
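map_atom_info is called throughout these examples but never shown. Judging from the x_0/y_0/z_0 and x_1/y_1/z_1 columns the code reads right after calling it, a minimal sketch of the helper would look roughly like this (the exact column handling is an assumption):

import pandas as pd

def map_atom_info(df, structures, atom_idx):
    # Join the structure rows (element and x/y/z coordinates) for one end of
    # the atom pair, then suffix the new columns with the pair position.
    df = pd.merge(df, structures, how='left',
                  left_on=['molecule_name', f'atom_index_{atom_idx}'],
                  right_on=['molecule_name', 'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df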
Example #3
    if not debug:
        if os.path.isfile(result_filename):
            raise FileExistsError("Result file exists!")

    train = pd.read_csv('../../data/train.csv', nrows=nrows)
    train_feats = pd.read_csv('../../data/train_stat_features.csv',
                              nrows=nrows)

    print('checking that key columns match...')
    for col in ['molecule_name', 'atom_index_0', 'atom_index_1']:
        assert (train[col] == train_feats[col]).all()

    structures = pd.read_csv('../../data/structures.csv')

    for i in range(2):
        train = map_atom_info(train, structures, i)

    train = concat_dataframes(train, train_feats)

    if debug:
        train.to_csv('../../data/debug_data/stat_merge.csv', index=False)
    train = artgor_utils.reduce_mem_usage(train)

    print("creating folds...")
    sorted_train = train.sort_values([
        "scalar_coupling_constant",
        "type",
        "dist",
    ])

    sorted_train = sorted_train.reset_index(drop=True)
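Example #3 stops right after the sort and re-index, so the fold assignment itself is not shown. A common way to finish this kind of "sort then stripe" split, assuming the index reset above and a hypothetical fold column, is:

# Hypothetical continuation: stripe consecutive rows of the sorted frame
# across folds so every fold spans the full range of the target.
n_folds = 5  # assumed; the actual value is set elsewhere in the script
sorted_train['fold'] = sorted_train.index % n_folds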
Example #4
        left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
        right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

    print("mapping info about atoms...")

    debug = False
    if debug:
        train = train[:100]
        test = test[:100]
        n_estimators = 50
        n_folds = 3
    else:
        n_folds = 10
        n_estimators = 30000

    train = train_utils.map_atom_info(train, structures, 0)
    train = train_utils.map_atom_info(train, structures, 1)

    test = train_utils.map_atom_info(test, structures, 0)
    test = train_utils.map_atom_info(test, structures, 1)

    train_p_0 = train[['x_0', 'y_0', 'z_0']].values
    train_p_1 = train[['x_1', 'y_1', 'z_1']].values
    test_p_0 = test[['x_0', 'y_0', 'z_0']].values
    test_p_1 = test[['x_1', 'y_1', 'z_1']].values

    train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
    test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
    train['dist_x'] = (train['x_0'] - train['x_1'])**2
    test['dist_x'] = (test['x_0'] - test['x_1'])**2
    train['dist_y'] = (train['y_0'] - train['y_1'])**2
Example #5
    if debug:
        nrows = 1000
        n_folds = 3
        n_estimators = 50
        result_filename = None
    else:
        nrows = 2 * 10 ** 6
        n_folds = 5
        n_estimators = 2000
        result_filename = '../../results/feature_selection/acsf_features_result.csv'

    train = pd.read_csv('../../data/train.csv', nrows=nrows)
    acsf_structures = pd.read_csv('../../data/structure_with_acsf.csv', index_col=0)

    for i in range(2):
        train = map_atom_info(train, acsf_structures, i)

    train_p_0 = train[['x_0', 'y_0', 'z_0']].values
    train_p_1 = train[['x_1', 'y_1', 'z_1']].values
    train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)

    if debug:
        train.to_csv('../../data/debug_data/acsf_merge.csv', index=False)

    train = artgor_utils.reduce_mem_usage(train)

    print("creating folds...")
    sorted_train = train.sort_values([
        "scalar_coupling_constant",
        "type",
        "dist",