Ejemplo n.º 1
0
    def test_predictor(self):
        """Check that Predictor scores SNPs given in feature-vector format.

        The central "test_dataset" is loaded into a DataSet, the predictor
        parameters are imported from "params.npz" in this test's data
        directory, and the first value of the first prediction row is
        expected to equal 0.2729 once rounded to four decimal places.
        """
        self.individual_debug = True
        self.init_test("test_predictor")
        self.init_predictor_instance()

        model = Predictor()

        # Input samples come from the shared test-dataset directory.
        dataset_path = os.path.join(
            combivep_settings.COMBIVEP_CENTRAL_TEST_DATASET_DIR,
            "test_dataset",
        )
        samples = DataSet(dataset_path)

        # Parameters are read from this test's own data directory.
        model.import_parameters(
            params_file=os.path.join(self.data_dir, "params.npz"),
        )

        predictions = model.predict(samples)
        first_probability = round(predictions[0][0], 4)
        self.assertEqual(
            first_probability,
            0.2729,
            msg="Predictor does not functional properly",
        )
Ejemplo n.º 2
0
def predict_deleterious_probability(SNPs_file,
                                    params_file=combivep_settings.USER_PARAMETERS_FILE,
                                    file_type=combivep_settings.FILE_TYPE_VCF,
                                    output_file=None,
                                    config_file=combivep_settings.COMBIVEP_CONFIGURATION_FILE,
                                    ):
    """
    Predict the deleterious probability of every SNP in a file and print
    the result as a tab-separated table.

    The SNPs are loaded, validated and scored through a DataSetManager, a
    Predictor configured from params_file produces one probability per
    record, and one line per record is written after a "#"-prefixed header.

    CBV (CombiVEP format) is a parsed format intended to be used by CombiVEP.
    CBV has 5 fields, CHROM, POS, REF, ALT, EFFECT (1=deleterious, 0=neutral).
    All are tab separated.

    Required arguments
    - SNPs_file   : list of SNPs to be predicted, can be either VCF or CBV
                    (default is VCF)

    Optional arguments
    - params_file : parameters file handed to Predictor.import_parameters
    - file_type   : format of SNPs_file, handed to DataSetManager.load_data
    - output_file : path of the file to write the table to; when None the
                    table is written to the current sys.stdout
    - config_file : configuration file handed to DataSetManager

    The output file, when one is given, is always closed before returning
    (even if writing fails), and sys.stdout itself is never reassigned.
    """
    #pre-processing test dataset
    sys.stderr.write('pre-processing dataset, this may take a while (around 750 SNPs/mins). . . . ' + '\n')
    dm = DataSetManager(config_file=config_file)
    dm.load_data(SNPs_file, file_type=file_type)
    dm.validate_data()
    dm.calculate_scores()

    #predict
    predictor = Predictor()
    predictor.import_parameters(params_file=params_file)
    # Flatten the prediction output so that out[i] belongs to record i.
    out = np.array(predictor.predict(dm.dataset)).reshape(-1,)

    # NOTE(review): "LRT_SCORT" looks like a typo for "LRT_SCORE"; it is kept
    # verbatim because consumers of the output may rely on the emitted header.
    header = ("CHROM", "POS", "REF", "ALT",
              "ACTUAL_DELETERIOUS_EFFECT",
              "PREDICTED_DELETERIOUS_PROBABILITY",
              "PHYLOP_SCORE", "SIFT_SCORE", "PP2_SCORE",
              "LRT_SCORT", "MT_SCORE", "GERP_SCORE")
    row_format = "%s\t%s\t%s\t%s\t%s\t%6.4f\t%s\t%s\t%s\t%s\t%s\t%s"

    #print output
    # Write to an explicit stream instead of rebinding sys.stdout, so that a
    # failure cannot leave stdout redirected and a caller's own redirection
    # of sys.stdout is left untouched.
    if output_file is None:
        out_stream = sys.stdout
    else:
        out_stream = open(output_file, 'w')
    try:
        out_stream.write("#" + "\t".join(header) + "\n")
        # Indexed access is kept because only len() and [] are known to be
        # supported by the dataset from what is visible here.
        for i in xrange(len(dm.dataset)):
            record = dm.dataset[i]
            snp_info = record[combivep_settings.KEY_SNP_INFO_SECTION]
            prediction = record[combivep_settings.KEY_PREDICTION_SECTION]
            scores = record[combivep_settings.KEY_SCORES_SECTION]
            out_stream.write(row_format % (
                snp_info[combivep_settings.KEY_CHROM],
                snp_info[combivep_settings.KEY_POS],
                snp_info[combivep_settings.KEY_REF],
                snp_info[combivep_settings.KEY_ALT],
                prediction[combivep_settings.KEY_TARGETS],
                out[i],
                scores[combivep_settings.KEY_PHYLOP_SCORE],
                scores[combivep_settings.KEY_SIFT_SCORE],
                scores[combivep_settings.KEY_PP2_SCORE],
                scores[combivep_settings.KEY_LRT_SCORE],
                scores[combivep_settings.KEY_MT_SCORE],
                scores[combivep_settings.KEY_GERP_SCORE],
            ) + "\n")
    finally:
        # Only close what was opened here; never close the process stdout.
        if output_file is not None:
            out_stream.close()
Ejemplo n.º 3
0
    def test_predictor(self):
        """Verify Predictor output for SNPs supplied as feature vectors.

        Loads the sample 'test_dataset', imports the parameters stored in
        'params.npz' under this test's data directory, runs the prediction
        and expects the first value of the first output row, rounded to
        four decimals, to be 0.2729.
        """
        self.individual_debug = True
        self.init_test('test_predictor')
        self.init_predictor_instance()

        predictor = Predictor()

        sample_path = os.path.join(cbv_const.CBV_SAMPLE_DATASET_DIR,
                                   'test_dataset')
        test_data = DataSet(sample_path)

        predictor.import_parameters(
            params_file=os.path.join(self.data_dir, 'params.npz'))

        result = predictor.predict(test_data)
        rounded_first = round(result[0][0], 4)
        self.assertEqual(rounded_first,
                         0.2729,
                         msg='Predictor does not functional properly')
Ejemplo n.º 4
0
def fast_predict(SNPs_file,
                 params_file=cbv_const.USER_PARAMS_FILE,
                 file_type=cbv_const.FILE_TYPE_VCF,
                 output_file=None,
                 cfg_file=cbv_const.CBV_CFG_FILE,
                 ):
    """
    Predict the deleterious probability of every SNP in a file and print
    the result as a tab-separated table.

    The data is loaded through a FastDataSetManager, a Predictor configured
    from params_file produces one probability per record, and one line per
    record is written after a "#"-prefixed header.

    CBV (CombiVEP format) is a parsed format intended to be used by CombiVEP.
    CBV has 5 fields, CHROM, POS, REF, ALT, EFFECT (1=deleterious, 0=neutral).
    All are tab separated.

    Required arguments
    - SNPs_file   : list of SNPs to be predicted

    Optional arguments
    - params_file : parameters file handed to Predictor.import_parameters
    - file_type   : accepted for interface compatibility; currently not
                    used by this function (see the review note below)
    - output_file : path of the file to write the table to; when None the
                    table is written to the current sys.stdout
    - cfg_file    : configuration file handed to FastDataSetManager

    The output file, when one is given, is always closed before returning
    (even if writing fails), and sys.stdout itself is never reassigned.
    """
    #pre-processing test dataset
    info('pre-processing dataset, this may take a while (around 750 SNPs/mins). . .')
    dm = FastDataSetManager(cfg_file=cfg_file)
    # NOTE(review): the file_type argument is ignored here; the data is
    # always loaded as dev_const.FILE_TYPE_SCORES - confirm this is intended.
    dm.load_data(SNPs_file, file_type=dev_const.FILE_TYPE_SCORES)

    #predict
    predictor = Predictor()
    predictor.import_parameters(params_file=params_file)
    # Flatten the prediction output so that out[i] belongs to record i.
    out = np.array(predictor.predict(dm.dataset)).reshape(-1,)

    # NOTE(review): "LRT_SCORT" looks like a typo for "LRT_SCORE"; it is kept
    # verbatim because consumers of the output may rely on the emitted header.
    header = ("CHROM", "POS", "REF", "ALT",
              "ACTUAL_DELETERIOUS_EFFECT",
              "PREDICTED_DELETERIOUS_PROBABILITY",
              "PHYLOP_SCORE", "SIFT_SCORE", "PP2_SCORE",
              "LRT_SCORT", "MT_SCORE", "GERP_SCORE")

    #print output
    # Write to an explicit stream instead of rebinding sys.stdout, so that a
    # failure cannot leave stdout redirected and a caller's own redirection
    # of sys.stdout is left untouched.
    if output_file is None:
        out_stream = sys.stdout
    else:
        out_stream = open(output_file, 'w')
    try:
        out_stream.write("#" + "\t".join(header) + "\n")
        # Indexed access is kept because only len() and [] are known to be
        # supported by the dataset from what is visible here.
        for i in xrange(len(dm.dataset)):
            snp_data = dm.dataset[i][cbv_const.KW_SNP_DATA]
            scores = dm.dataset[i][cbv_const.KW_SCORES]
            fields = (snp_data.chrom,
                      snp_data.pos,
                      snp_data.ref,
                      snp_data.alt,
                      snp_data.target,
                      "%6.4f" % out[i],
                      scores.phylop_score,
                      scores.sift_score,
                      scores.pp2_score,
                      scores.lrt_score,
                      scores.mt_score,
                      scores.gerp_score)
            # The field types are not visible from here; rendering each one
            # with %s keeps string fields unchanged and avoids a TypeError
            # from str.join should any field be a number or None.
            out_stream.write("\t".join("%s" % (field,) for field in fields) + "\n")
    finally:
        # Only close what was opened here; never close the process stdout.
        if output_file is not None:
            out_stream.close()