def load_data(data_size, features):
    """Load train / heldout / test feature matrices.

    Args:
        data_size: number of training examples to load.  When it equals the
            full training-set size, the ``sample`` argument is omitted so the
            loader returns the complete training set.
        features: feature specification forwarded to
            ``load_postprocessed_feature_data``.

    Returns:
        dict with keys ``'train_x'``, ``'train_y'``, ``'heldout_x'``,
        ``'heldout_y'``, ``'test_x'``, ``'test_y'``.
    """
    # Full size of the training set.  Previously an undocumented magic
    # number inlined in the comparison below.
    FULL_TRAIN_SIZE = 263088

    data = {}
    if data_size != FULL_TRAIN_SIZE:
        # Subsample the training set down to the requested size.
        data['train_x'], data['train_y'] = load_postprocessed_feature_data(
            features, TRAIN_SET_NAME, sample=data_size)
    else:
        data['train_x'], data['train_y'] = load_postprocessed_feature_data(
            features, TRAIN_SET_NAME)
    assert data['train_x'].shape[0] == data_size

    data['heldout_x'], data['heldout_y'] = load_postprocessed_feature_data(
        features, HELDOUT_SET_NAME)
    data['test_x'], data['test_y'] = load_postprocessed_feature_data(
        features, TEST_SET_NAME)

    # All splits must share the same feature dimensionality (the sibling
    # variant of load_data also checks the heldout split; made consistent).
    assert (data['train_x'].shape[1]
            == data['test_x'].shape[1]
            == data['heldout_x'].shape[1])
    return data
def load_data(cutoff_value=5):
    """Load the train and test feature matrices for a given cutoff.

    ``cutoff_value == 5`` is the default; in that case no explicit path is
    passed and the loader determines the data location automatically.  For
    cutoffs 0 and 3 the path comes from the settings file.

    Returns:
        dict with keys ``'train_x'``, ``'train_y'``, ``'test_x'``,
        ``'test_y'``.
    """
    if cutoff_value in (0, 3):
        data_path = SETTINGS.get(
            'paths', 'dataFeaturesPennPostprocessed{}'.format(cutoff_value))
    else:
        data_path = None

    features = get_features_for_set_names(('orig', ))

    train_x, train_y = load_postprocessed_feature_data(
        features, TRAIN_SET_NAME, data_path=data_path)
    assert train_x.shape[0] == DATA_SIZE

    test_x, test_y = load_postprocessed_feature_data(
        features, TEST_SET_NAME, data_path=data_path)
    # Train and test splits must agree on feature dimensionality.
    assert train_x.shape[1] == test_x.shape[1]

    return {
        'train_x': train_x,
        'train_y': train_y,
        'test_x': test_x,
        'test_y': test_y,
    }
def load_data(data_size, cutoff_value):
    """Load train / heldout / test feature matrices for a given cutoff.

    Args:
        data_size: number of training examples to load.  When it equals the
            full training-set size, the ``sample`` argument is omitted so the
            loader returns the complete training set.
        cutoff_value: feature-cutoff setting.  5 is the default, in which
            case the path is determined automatically by the loader; for
            0 and 3 the path is read from the settings file.

    Returns:
        dict with keys ``'train_x'``, ``'train_y'``, ``'heldout_x'``,
        ``'heldout_y'``, ``'test_x'``, ``'test_y'``.
    """
    # Full size of the training set.  Previously an undocumented magic
    # number inlined in the comparison below.
    FULL_TRAIN_SIZE = 263088

    # cutoff_value == 5 is the default value; the path is then determined
    # automatically by the loader.  (Translated from the original Czech
    # comment.)
    data_path = None
    if cutoff_value in (0, 3):
        data_path = SETTINGS.get(
            'paths', 'dataFeaturesPennPostprocessed{}'.format(cutoff_value))

    data = {}
    features = get_features_for_set_names(('orig',))
    if data_size != FULL_TRAIN_SIZE:
        # Subsample the training set down to the requested size.
        data['train_x'], data['train_y'] = load_postprocessed_feature_data(
            features, TRAIN_SET_NAME, sample=data_size, data_path=data_path)
    else:
        data['train_x'], data['train_y'] = load_postprocessed_feature_data(
            features, TRAIN_SET_NAME, data_path=data_path)
    assert data['train_x'].shape[0] == data_size

    data['heldout_x'], data['heldout_y'] = load_postprocessed_feature_data(
        features, HELDOUT_SET_NAME, data_path=data_path)
    data['test_x'], data['test_y'] = load_postprocessed_feature_data(
        features, TEST_SET_NAME, data_path=data_path)

    # All splits must share the same feature dimensionality.
    assert (data['train_x'].shape[1]
            == data['test_x'].shape[1]
            == data['heldout_x'].shape[1])
    return data
# NOTE(review): this chunk begins inside a feature-name list whose opening
# (presumably ``FEATURES = [``) lies before the visible source — confirm
# against the full file.
'a_words_after_np', 'a_words_before_head', 'a_words_before_np',
'b_head_proper', 'b_head_pos_simple', 'b_object_form',
'b_pos_after_head_as_list', 'b_pos_before_head_as_list',
'b_pp_object_form', 'b_postmodification_type', 'b_referent',
'b_words_after_head_as_list', 'b_words_after_np_as_list',
'b_words_before_head_as_list', 'b_words_before_np_as_list',
'c_countability_bnc', 'd_head_form_embeddings', 'e_kenlm_ggl_5_lc_nbs'
]

# Data-set identifiers passed to load_postprocessed_feature_data below.
TRAIN_SET_NAME = 'train'
# NOTE(review): the "test" set name is mapped to the 'heldout' split here —
# confirm this is intentional rather than a copy/paste slip.
TEST_SET_NAME = 'heldout'
# Number of training examples sampled in the __main__ block below.
TRAINING_DATA_SIZE = 20000


def softmax(z):
    """Row-wise softmax: exp(z) normalized along axis 1.

    Assumes ``z`` is 2-D (rows = samples, columns = classes), as implied by
    ``axis=1, keepdims=True``.
    """
    return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)


def cost(Y, T):
    """Total cross-entropy cost: -sum(T * log(Y)) over all entries.

    Y - predicted matrix
    T - target matrix
    """
    return -np.multiply(T, np.log(Y)).sum()


if __name__ == '__main__':
    # Load a TRAINING_DATA_SIZE-sample of the training split and the full
    # test split, then verify both share the same feature dimensionality.
    data = {}
    data['train_x'], data['train_y'] = load_postprocessed_feature_data(
        FEATURES, TRAIN_SET_NAME, sample=TRAINING_DATA_SIZE)
    data['test_x'], data['test_y'] = load_postprocessed_feature_data(
        FEATURES, TEST_SET_NAME)
    assert data['train_x'].shape[1] == data['test_x'].shape[1]