def test_transform_features_all_codes(self):
    """Verify the transformed output when no feature-code filter is passed.

    Runs ``FileProcessor.transform_features`` on the input file and compares
    every data line of the output against values recomputed with NumPy/SciPy:

    * z-scores per feature column, using the sample standard deviation
      (``ddof=1``);
    * the index of the largest feature in each row;
    * the absolute difference between that largest value and the mean of
      the column it belongs to.

    Also checks that the output holds exactly one data line per input row,
    so a truncated or empty output cannot pass silently.
    """
    file_processor = FileProcessor(self.input_file_path)
    file_processor.transform_features(self.output_file_path)

    # The feature codes are not needed here: every row is expected back.
    job_ids, _feature_codes, features = read_test_data_as_arrays(
        self.input_file_path)

    means = features.mean(axis=0)
    expected_z_scores = stats.zscore(
        features, axis=0, ddof=1)  # using sample standard deviation
    expected_argmaxs = np.argmax(features, axis=1)
    expected_maxs = np.amax(features, axis=1)
    # Distance of each row's maximum from the mean of the column it sits in.
    expected_diffs = np.abs(expected_maxs - means[expected_argmaxs])

    line_index = 0
    with open(self.output_file_path) as f:
        next(f)  # skip the first line (presumably a header row)
        for line in f:
            job_id, z_scores, argmax, diff = _parse_transformed_line(line)
            np.testing.assert_equal(job_id, job_ids[line_index])
            np.testing.assert_allclose(
                z_scores, expected_z_scores[line_index])
            np.testing.assert_equal(argmax, expected_argmaxs[line_index])
            np.testing.assert_almost_equal(
                diff, expected_diffs[line_index])
            line_index += 1

    # Without this check, an output with too few rows would pass vacuously.
    self.assertEqual(line_index, len(job_ids))
def test_transform_features_some_codes(self):
    """Verify the transformed output when only two feature codes are kept.

    Two codes are drawn at random from ``self.feature_codes`` and passed to
    ``FileProcessor.transform_features``. The expected z-scores (sample
    standard deviation), per-row argmax and distance of the row maximum from
    its column mean are recomputed over just the rows carrying those codes,
    then compared line by line with the output file. Finally the number of
    data lines must equal the number of selected rows.
    """
    selected_codes = random.sample(self.feature_codes, k=2)

    processor = FileProcessor(self.input_file_path)
    processor.transform_features(self.output_file_path, selected_codes)

    job_ids, feature_codes, features = read_test_data_as_arrays(
        self.input_file_path)

    # Positions of all rows whose code was sampled, in ascending order.
    selected_indexes = sorted(
        position
        for code in selected_codes
        for position in np.where(feature_codes == code)[0].tolist()
    )
    kept_job_ids = job_ids[selected_indexes]
    kept_features = features[selected_indexes]

    column_means = kept_features.mean(axis=0)
    # ddof=1: using sample standard deviation
    expected_z_scores = stats.zscore(kept_features, axis=0, ddof=1)
    expected_argmaxs = np.argmax(kept_features, axis=1)
    expected_maxs = np.amax(kept_features, axis=1)
    expected_diffs = np.abs(
        expected_maxs - column_means[expected_argmaxs])

    rows_seen = 0
    with open(self.output_file_path) as f:
        next(f)  # skip the first line (presumably a header row)
        for rows_seen, line in enumerate(f, start=1):
            row = rows_seen - 1
            job_id, z_scores, argmax, diff = _parse_transformed_line(line)
            np.testing.assert_equal(job_id, kept_job_ids[row])
            np.testing.assert_allclose(z_scores, expected_z_scores[row])
            np.testing.assert_equal(argmax, expected_argmaxs[row])
            np.testing.assert_almost_equal(diff, expected_diffs[row])

    self.assertEqual(rows_seen, len(selected_indexes))
"""Run the feature transformation on the test file under ``resources``."""
import os
from pathlib import Path

from file_processor import FileProcessor


def main():
    """Transform ``resources/test.tsv`` into ``resources/test_proc.tsv``.

    Both paths are resolved relative to the directory containing this
    script (symlinks resolved), so the script can be launched from any
    working directory.
    """
    current_path = Path(os.path.realpath(__file__)).parent
    input_file_path = current_path / 'resources' / 'test.tsv'
    out_file_path = current_path / 'resources' / 'test_proc.tsv'

    file_processor = FileProcessor(input_file_path)
    file_processor.transform_features(out_file_path)

    # Single space before the path: the old print() call combined a trailing
    # space in the literal with print's own separator, giving a double space.
    print(f"Results saved to {out_file_path}")


if __name__ == "__main__":
    main()