class TestReferer(SafePreProcTester):
    """Tests for Referer SNP validation and score lookup.

    Both tests load the central test configuration file into a fresh
    Referer instance before querying it.
    """

    def __init__(self, test_name):
        SafePreProcTester.__init__(self, test_name)

    def setUp(self):
        self.test_class = 'referer'

    def init_referer_instance(self):
        """Create a fresh Referer for the current test."""
        self.__referer = Referer()

    def __prepare_referer(self):
        """Create a Referer and load the central test configuration into it."""
        self.init_referer_instance()
        self.__referer.config_file = combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE
        self.__referer.load_config()

    def test_validate_snp(self):
        """validate_snp accepts the five listed SNPs and rejects the last one."""
        self.init_test('test_validate_snp')
        self.__prepare_referer()

        # (chrom, pos, ref, alt, expected result); chromosome names are
        # given both with and without the 'chr' prefix.
        cases = (
            ('1', 887560, 'A', 'C', True),
            ('chr3', 25836088, 'C', 'A', True),
            ('20', 17474690, 'T', 'G', True),
            ('chrX', 56296488, 'G', 'C', True),
            ('Y', 15581983, 'G', 'A', True),
            ('chr16', 21086416, 'T', 'A', False),
        )
        for chrom, pos, ref, alt, expected in cases:
            if expected:
                check = self.assertTrue
            else:
                check = self.assertFalse
            check(self.__referer.validate_snp(chrom, pos, ref, alt),
                  "Incorrect SNP validating")

    def test_get_scores(self):
        """get_scores returns the expected score strings for 3:108541778 T>C."""
        self.init_test('test_get_scores')
        self.__prepare_referer()

        rec = self.__referer.get_scores('3', 108541778, 'T', 'C')

        # NOTE: assertions on the SNP-info fields (chrom/pos/ref/alt) of the
        # returned record are disabled; only the score fields are checked.
        expected_scores = (
            (combivep_settings.KEY_PHYLOP_SCORE, '0.102322'),
            (combivep_settings.KEY_SIFT_SCORE, '0.91'),
            (combivep_settings.KEY_PP2_SCORE, '0'),
            (combivep_settings.KEY_LRT_SCORE, '0.312516'),
            (combivep_settings.KEY_MT_SCORE, '0.000000'),
            (combivep_settings.KEY_GERP_SCORE, '-3.16'),
        )
        for score_key, expected_value in expected_scores:
            self.assertEqual(rec[score_key], expected_value,
                             "Incorrect LJB formatting")

    def tearDown(self):
        self.remove_working_dir()
class DataSetManager(CombiVEPBase):
    """Load SNP records into a DataSet, filter and score them, and split them.

    A Referer (configured from ``config_file``) is used to validate SNPs
    and to look up their scores. Records are stored in ``self.dataset`` as
    dictionaries holding an SNP-info section, a prediction section and,
    after ``calculate_scores``, a scores section.
    """

    def __init__(self, config_file=combivep_settings.COMBIVEP_CONFIGURATION_FILE):
        CombiVEPBase.__init__(self)

        self.referer = Referer()
        self.referer.config_file = config_file
        self.referer.load_config()
        self.dataset = DataSet()

    def __clear_data(self):
        """Drop every record currently held in the dataset."""
        self.dataset.clear()

    def load_data(self, file_name, file_type=combivep_settings.FILE_TYPE_VCF):
        """Replace the dataset content with the records read from file_name.

        file_type selects the reader (VCF or CBV). Any other value leaves
        the dataset untouched and returns None.
        """
        if file_type == combivep_settings.FILE_TYPE_VCF:
            return self.__load_vcf_data(file_name)
        if file_type == combivep_settings.FILE_TYPE_CBV:
            return self.__load_cbv_data(file_name)

    def __load_records(self, reader_class, file_name,
                       chrom_key, pos_key, ref_key, alt_key,
                       targets_key=None):
        """Clear the dataset and refill it from file_name using reader_class.

        The *_key arguments name the reader-specific fields inside each
        record's SNP-info section. When targets_key is None the prediction
        targets are stored as None; otherwise they are read from the
        record's prediction section under targets_key.
        """
        self.__clear_data()
        # The reader is built after clearing, matching the original order.
        reader = reader_class()
        reader.read(file_name)
        for rec in reader.fetch_hash_snps():
            raw_info = rec[combivep_settings.KEY_SNP_INFO_SECTION]
            snp_data = {
                combivep_settings.KEY_CHROM: raw_info[chrom_key],
                combivep_settings.KEY_POS: raw_info[pos_key],
                combivep_settings.KEY_REF: raw_info[ref_key],
                combivep_settings.KEY_ALT: raw_info[alt_key],
            }
            if targets_key is None:
                targets = None
            else:
                targets = rec[combivep_settings.KEY_PREDICTION_SECTION][targets_key]
            prediction = {combivep_settings.KEY_TARGETS: targets}
            self.dataset.append({
                combivep_settings.KEY_SNP_INFO_SECTION: snp_data,
                combivep_settings.KEY_PREDICTION_SECTION: prediction,
            })

    def __load_vcf_data(self, file_name):
        """Load SNPs from a VCF file; prediction targets are set to None."""
        self.__load_records(VcfReader, file_name,
                            combivep_settings.KEY_VCF_CHROM,
                            combivep_settings.KEY_VCF_POS,
                            combivep_settings.KEY_VCF_REF,
                            combivep_settings.KEY_VCF_ALT)

    def __load_cbv_data(self, file_name):
        """Load SNPs and their prediction targets from a CBV file."""
        self.__load_records(CbvReader, file_name,
                            combivep_settings.KEY_CBV_CHROM,
                            combivep_settings.KEY_CBV_POS,
                            combivep_settings.KEY_CBV_REF,
                            combivep_settings.KEY_CBV_ALT,
                            targets_key=combivep_settings.KEY_CBV_TARGETS)

    def __snp_fields(self, item):
        """Return (chrom, pos, ref, alt) from a dataset item's SNP-info section."""
        snp_info = item[combivep_settings.KEY_SNP_INFO_SECTION]
        return (snp_info[combivep_settings.KEY_CHROM],
                snp_info[combivep_settings.KEY_POS],
                snp_info[combivep_settings.KEY_REF],
                snp_info[combivep_settings.KEY_ALT])

    def validate_data(self):
        """Keep only the items that the referer validates.

        This prevents misinterpretation caused by version differences
        between data points: items that do not exist in the UCSC database
        are removed from self.dataset (in place).
        """
        self.dataset[:] = [item for item in self.dataset
                           if self.referer.validate_snp(*self.__snp_fields(item))]

    def calculate_scores(self):
        """Attach scores to every item and drop the items that have none."""
        # Get scores from the LJB database.
        for item in self.dataset:
            item[combivep_settings.KEY_SCORES_SECTION] = \
                self.referer.get_scores(*self.__snp_fields(item))
        # Remove items from self.dataset if they don't have scores.
        self.dataset[:] = [item for item in self.dataset
                           if item[combivep_settings.KEY_SCORES_SECTION] is not None]

    def partition_data(self,
                       proportion_training_data=combivep_settings.PROPORTION_TRAINING_DATA,
                       proportion_validation_data=combivep_settings.PROPORTION_VALIDATION_DATA,
                       ):
        """Compute the training and validation partition sizes.

        The training size is the floor of the dataset length scaled by the
        training share of the two proportions; the validation size is the
        remainder. Must be called before get_training_data or
        get_validation_data, which read training_data_size.
        """
        total_proportion = proportion_training_data + proportion_validation_data
        self.training_data_size = int(math.floor(len(self.dataset) * proportion_training_data / total_proportion))
        self.validation_data_size = len(self.dataset) - self.training_data_size

    def get_training_data(self):
        """Return a new DataSet holding the first training_data_size items."""
        dataset = DataSet()
        # range (not xrange) keeps this working on both Python 2 and 3.
        for i in range(0, self.training_data_size):
            dataset.append(self.dataset[i])
        return dataset

    def get_validation_data(self):
        """Return a new DataSet holding the items after the training partition."""
        dataset = DataSet()
        # range (not xrange) keeps this working on both Python 2 and 3.
        for i in range(self.training_data_size, len(self.dataset)):
            dataset.append(self.dataset[i])
        return dataset

    def set_shuffle_seed(self, shuffle_seed):
        """Forward the shuffle seed to the underlying dataset."""
        self.dataset.set_shuffle_seed(shuffle_seed)

    def shuffle_data(self):
        """Shuffle the underlying dataset."""
        self.dataset.shuffle()