def test_transform_features_all_codes(self):
        """Check transform_features output when no feature codes are passed.

        Runs ``FileProcessor.transform_features`` on the input file, then
        recomputes the expected values from the same input with NumPy/SciPy
        and compares them with the written output, row by row:

        * column-wise z-scores using the sample standard deviation (ddof=1),
        * the index of the largest feature in each row,
        * the absolute difference between that largest feature and the
          mean of the column it sits in.

        Finally asserts that the output holds exactly one row per input
        row, so a truncated or empty output cannot pass silently.
        """
        file_processor = FileProcessor(self.input_file_path)
        file_processor.transform_features(self.output_file_path)

        # feature_codes is returned by the reader but not needed here.
        job_ids, feature_codes, features = read_test_data_as_arrays(
            self.input_file_path)

        means = features.mean(axis=0)

        expected_z_scores = stats.zscore(
            features, axis=0, ddof=1)  # using sample standard deviation
        expected_argmaxs = np.argmax(features, axis=1)
        expected_maxs = np.amax(features, axis=1)
        # Each row's maximum is compared with the mean of its own column.
        expected_diffs = np.abs(expected_maxs - means[expected_argmaxs])

        rows_checked = 0

        with open(self.output_file_path) as f:
            # The first line is skipped (presumably a header row).
            next(f)

            for rows_checked, line in enumerate(f, start=1):
                line_index = rows_checked - 1
                job_id, z_scores, argmax, diff = _parse_transformed_line(line)

                np.testing.assert_equal(job_id, job_ids[line_index])
                np.testing.assert_allclose(z_scores,
                                           expected_z_scores[line_index])
                np.testing.assert_equal(argmax, expected_argmaxs[line_index])
                np.testing.assert_almost_equal(diff,
                                               expected_diffs[line_index])

        # Without this check an output with too few rows would pass.
        self.assertEqual(rows_checked, len(job_ids))
    def test_transform_features_some_codes(self):
        """Check transform_features output for two randomly picked codes.

        Two codes are sampled from ``self.feature_codes`` and passed to
        ``FileProcessor.transform_features``. The expected z-scores
        (sample standard deviation), per-row argmax and absolute
        max-to-column-mean difference are recomputed from the input rows
        whose feature code matches one of the sampled codes, then compared
        with every output line after the first. The number of compared
        lines must equal the number of selected input rows.
        """
        selected_codes = random.sample(self.feature_codes, k=2)

        processor = FileProcessor(self.input_file_path)
        processor.transform_features(self.output_file_path, selected_codes)

        all_job_ids, all_codes, all_features = read_test_data_as_arrays(
            self.input_file_path)

        # Row positions of every record carrying one of the sampled codes,
        # put back into input order.
        row_indexes = sorted(
            row
            for code in selected_codes
            for row in np.where(all_codes == code)[0].tolist()
        )

        kept_job_ids = all_job_ids[row_indexes]
        kept_features = all_features[row_indexes]

        column_means = kept_features.mean(axis=0)

        # ddof=1: z-scores use the sample standard deviation.
        want_z_scores = stats.zscore(kept_features, axis=0, ddof=1)
        want_argmaxs = np.argmax(kept_features, axis=1)
        want_maxs = np.amax(kept_features, axis=1)
        want_diffs = np.abs(want_maxs - column_means[want_argmaxs])

        rows_seen = 0

        with open(self.output_file_path) as out:
            # Skip the first line of the output before comparing rows.
            next(out)

            for rows_seen, line in enumerate(out, start=1):
                row = rows_seen - 1
                job_id, z_scores, argmax, diff = _parse_transformed_line(line)

                np.testing.assert_equal(job_id, kept_job_ids[row])
                np.testing.assert_allclose(z_scores, want_z_scores[row])
                np.testing.assert_equal(argmax, want_argmaxs[row])
                np.testing.assert_almost_equal(diff, want_diffs[row])

            self.assertEqual(rows_seen, len(row_indexes))
Beispiel #3
0
from pathlib import Path
import os

from file_processor import FileProcessor

if __name__ == "__main__":
    # Locate the resource files relative to this script's real location,
    # so the script works regardless of the current working directory.
    current_path = Path(__file__).resolve().parent

    input_file_path = current_path / 'resources' / 'test.tsv'
    out_file_path = current_path / 'resources' / 'test_proc.tsv'

    # Transform the features of the input file and write them out.
    file_processor = FileProcessor(input_file_path)
    file_processor.transform_features(out_file_path)

    # Build the message as one string: passing the path as a second
    # print() argument put print's separator after the literal's trailing
    # space, producing a double space in the output.
    print(f"Results saved to {out_file_path}")