def test_arbitrary_rankings(self):
        """Test that _select_best_with_multiple_criteria can rank with multiple criteria and get the right answer.

        Each criterion passed in is a ``(key_function, min_or_max)`` pair, and
        only the first element of the returned sequence is checked.
        """
        a = (0, 1)
        b = (1, 1)
        c = (1, 2)
        d = (2, 1)
        e = (2, 2)
        f = (0, 4)
        g = (-1, 5)
        # Named ``candidates`` rather than ``input`` so the builtin is not shadowed.
        candidates = [a, b, c, d, e, f, g]

        # left minimum, then right minimum: g is the only tuple with left == -1
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[0], min), (lambda x: x[1], min)])
        self.assertEqual(result[0], g)

        # right minimum, then left minimum: a, b and d tie on right == 1; a has the smallest left
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[1], min), (lambda x: x[0], min)])
        self.assertEqual(result[0], a)

        # left maximum, then right minimum: d and e tie on left == 2, so the
        # second key has to read the right element (x[1]) to single out d.
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[0], max), (lambda x: x[1], min)])
        self.assertEqual(result[0], d)

        # sum maximum, then right maximum: e, f and g tie on sum == 4; g has the largest right
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(sum, max), (lambda x: x[1], max)])
        self.assertEqual(result[0], g)
    def test_arbitrary_rankings(self):
        """Test that _select_best_with_multiple_criteria can rank with multiple criteria and get the right answer.

        Each criterion passed in is a ``(key_function, min_or_max)`` pair, and
        only the first element of the returned sequence is checked.
        """
        a = (0, 1)
        b = (1, 1)
        c = (1, 2)
        d = (2, 1)
        e = (2, 2)
        f = (0, 4)
        g = (-1, 5)
        # Named ``candidates`` rather than ``input`` so the builtin is not shadowed.
        candidates = [a, b, c, d, e, f, g]

        # left minimum, then right minimum: g is the only tuple with left == -1
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[0], min), (lambda x: x[1], min)])
        self.assertEqual(result[0], g)

        # right minimum, then left minimum: a, b and d tie on right == 1; a has the smallest left
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[1], min), (lambda x: x[0], min)])
        self.assertEqual(result[0], a)

        # left maximum, then right minimum: d and e tie on left == 2, so the
        # second key has to read the right element (x[1]) to single out d.
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(lambda x: x[0], max), (lambda x: x[1], min)])
        self.assertEqual(result[0], d)

        # sum maximum, then right maximum: e, f and g tie on sum == 4; g has the largest right
        result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
            candidates, [(sum, max), (lambda x: x[1], max)])
        self.assertEqual(result[0], g)
    def test_intitialize(self):
        """Build an ensembl datasource from the saccer test config and check that a tx-mode set on it is read back."""
        config_dir = "testdata/ensembl/saccer/"
        parser = ConfigUtils.createConfigParser(config_dir + "ensembl.config")

        # Constructor keyword arguments, read from the [general] section in this order.
        settings = dict(
            (option, parser.get("general", option))
            for option in ("title", "version", "src_file")
        )

        datasource = EnsemblTranscriptDatasource(**settings)
        self.assertIsNotNone(datasource)

        datasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertTrue(TranscriptProvider.TX_MODE_BEST_EFFECT == datasource.get_tx_mode())
    def test_intitialize(self):
        """Instantiate an ensembl datasource from the saccer config; verify it exists and keeps the tx-mode it is given."""
        parser = ConfigUtils.createConfigParser("testdata/ensembl/saccer/" + "ensembl.config")

        def general(option):
            # Every value this test needs lives in the [general] section.
            return parser.get("general", option)

        ds_title = general("title")
        ds_version = general("version")
        ds_src_file = general("src_file")

        datasource = EnsemblTranscriptDatasource(
            title=ds_title,
            version=ds_version,
            src_file=ds_src_file,
        )
        self.assertIsNotNone(datasource)

        datasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertTrue(TranscriptProvider.TX_MODE_BEST_EFFECT == datasource.get_tx_mode())
Example #5
0
 def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file,
                             gencode_version):
     """Rebuild the transcript indices for a fixed set of GENCODE test genes and open them as a datasource.

     base_output_filename -- prefix of the three ``.idx`` index locations; any existing ones are removed first.
     protein_id_mapping_file -- forwarded unchanged to ``construct_ensembl_indices``.
     gencode_version -- version number used in the per-gene test file names and in the datasource version ("v<N>").

     Returns the EnsemblTranscriptDatasource opened on base_output_filename with the "basic" transcript filter.
     """
     version_tag = str(gencode_version)
     gene_symbols = ("MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A",
                     "DDX11L10")

     # Each gene has a GTF and a FASTA file sharing the same path stem.
     stems = ["testdata/gencode/" + symbol + ".gencode.v" + version_tag
              for symbol in gene_symbols]
     gtf_list = [stem + ".annotation.gtf" for stem in stems]
     fasta_list = [stem + ".pc_transcripts.fa" for stem in stems]

     # Clear out indices left over from a previous run; a missing index is not an error.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx",
                    ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

     GenomeBuildFactory().construct_ensembl_indices(
         gtf_list,
         fasta_list,
         base_output_filename,
         protein_id_mapping_file=protein_id_mapping_file)

     return EnsemblTranscriptDatasource(base_output_filename,
                                        title="GENCODE",
                                        version="v" + version_tag,
                                        tx_filter="basic")
Example #6
0
    def createTranscriptProviderDatasource(config,
                                           tx_mode="CANONICAL",
                                           protocol="file"):
        """ Creates a GENCODE or GAF 3.0 transcript datasource from a config file.

            The GENCODE datasource is used when the path in [gencode] gencodeDir exists on disk.
            Otherwise a Gaf datasource is built from the [gaf3.0] keys gaf_fname and
            gaf_transcript_seqs_fname.

            config -- config parser providing the sections above.
            tx_mode -- forwarded to whichever datasource is created.
            protocol -- forwarded to the Gaf datasource only.

            Raises Exception (wrapping the underlying reason in its message) when the GAF
            fallback cannot be created.
            """
        gencode_dir = config.get("gencode", "gencodeDir")
        if os.path.exists(gencode_dir):
            return EnsemblTranscriptDatasource(
                gencode_dir + "/gencode.v19.annotation.gtf",
                title="GENCODE",
                version="TEST v19",
                tx_filter="basic",
                tx_mode=tx_mode)

        # No GENCODE directory on disk: fall back to GAF 3.0.
        try:
            fname = config.get("gaf3.0", "gaf_fname")
            transcripts_fname = config.get("gaf3.0",
                                           "gaf_transcript_seqs_fname")
            return Gaf(fname,
                       transcripts_fname,
                       tx_mode=tx_mode,
                       protocol=protocol)
        except Exception as gaf_failure_reason:
            raise Exception(
                "Couldn't create a transcript provider datasource, no gencode dir found and %s"
                % gaf_failure_reason)
    def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
        """Test genomic --> exon transform on real data.

        loc -- pair of genomic (start, end) positions; passed as given to the converter and
               cast with int() for the length check.
        gt_d -- ground-truth exon-space start position expected for ``loc``.
        """
        gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
        gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
        base_output_filename = "out/test_variant_classification"

        # Remove indices from a previous run so they are rebuilt from the test data.
        shutil.rmtree(base_output_filename + ".transcript.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gene.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gp_bin.idx", ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
        ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")
        tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

        start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])

        # The exon-space interval must have the same length as the genomic interval.
        loc_length = (int(loc[1]) - int(loc[0]))
        self.assertEqual(end - start, loc_length, str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))

        # Report the ground truth (gt_d) in the failure message, not the computed end.
        self.assertEqual(start, gt_d, "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")" + "   exons: " + str(tx[0].get_exons()))
    def test_simple_annotate(self):
        """ Annotate a single-position C>A mutation on chromosome 22 with the saccer-configured ensembl datasource.

        No assertion is made on the annotated result; the test only requires the call to complete.
        """
        config_dir = "testdata/ensembl/saccer/"
        parser = ConfigUtils.createConfigParser(config_dir + "ensembl.config")

        # Constructor keyword arguments, read from the [general] section in this order.
        settings = dict(
            (option, parser.get("general", option))
            for option in ("title", "version", "src_file")
        )
        datasource = EnsemblTranscriptDatasource(**settings)

        mutation = MutationData()
        mutation_fields = (
            ("chr", "22"),
            ("start", "22161963"),
            ("end", "22161963"),
            ("ref_allele", "C"),
            ("alt_allele", "A"),
        )
        for field, value in mutation_fields:
            setattr(mutation, field, value)

        datasource.annotate_mutation(mutation)
    def test_simple_annotate(self):
        """ Smoke-test annotate_mutation on one chr22 C>A mutation; nothing is asserted about the result.
        """
        parser = ConfigUtils.createConfigParser("testdata/ensembl/saccer/" + "ensembl.config")

        def general(option):
            # Every value this test needs lives in the [general] section.
            return parser.get("general", option)

        ds_title = general("title")
        ds_version = general("version")
        ds_src_file = general("src_file")
        datasource = EnsemblTranscriptDatasource(
            title=ds_title,
            version=ds_version,
            src_file=ds_src_file,
        )

        # Single-base mutation: start and end are the same position.
        position = "22161963"
        mutation = MutationData()
        mutation.chr = "22"
        mutation.start = position
        mutation.end = position
        mutation.ref_allele = "C"
        mutation.alt_allele = "A"

        datasource.annotate_mutation(mutation)
Example #10
0
 def test_tie_breaking_rankings(self):
     """Test that _select_best_with_multiple_criteria still picks a winner when earlier criteria tie."""
     a = (0, 0, 1)
     b = (0, 0, 2)
     c = (0, 0, 3)
     candidates = [a, b, c]

     # The first three criteria cannot separate the candidates (equal x[0],
     # a constant key, equal x[1]); only the last one, max of x[2], can.
     criteria = [
         (lambda x: x[0], max),
         (lambda x: 3, min),
         (lambda x: x[1], max),
         (lambda x: x[2], max),
     ]
     result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
         candidates, criteria)
     self.assertEqual(result[0], c)
 def _create_ensembl_ds_from_saccer(self):
     """Rebuild the transcript indices from the trimmed S. cerevisiae EF4.71 test files and return a datasource on them."""
     base_output_filename = "out/test_saccer_ds"
     gtf_inputs = ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"]
     fasta_inputs = ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"]

     # Clear out indices left over from a previous run; a missing index is not an error.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

     GenomeBuildFactory().construct_ensembl_indices(gtf_inputs, fasta_inputs, base_output_filename)
     return EnsemblTranscriptDatasource(base_output_filename, title="ensembl", version="71")
 def test_tie_breaking_rankings(self):
     """Test that _select_best_with_multiple_criteria works when only the final criterion breaks the tie."""
     lowest, middle, highest = (0, 0, 1), (0, 0, 2), (0, 0, 3)
     candidates = [lowest, middle, highest]

     def first(x):
         return x[0]

     def constant(x):
         # Same key for every candidate, so this criterion can never separate them.
         return 3

     def second(x):
         return x[1]

     def third(x):
         return x[2]

     result = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
         candidates,
         [(first, max), (constant, min), (second, max), (third, max)])
     self.assertEqual(result[0], highest)
Example #13
0
    def createDatasourceFromConfigParser(configParser, leafDir):
        """
        Instantiate the datasource described by the config file of a leaf directory.

        configParser -- config parser instance from the config file in the leafdir. The "type" option of its
                        "general" section selects which datasource class is created. For information on
                        config file format/conventions see (TODO)

        leafDir -- contains the file and necessary files (post indexing and install steps) to instantiate a
                   datasource. Options that name files are prefixed with this directory.

        Returns the datasource, after its hashcode has been set from DatasourceFactory._retrieve_hash_code(leafDir).

        Raises RuntimeError for an unknown type, or for a bigwig datasource when NGSLIB_INSTALLED is false.
        """
        def general(option):
            # Read one option from the [general] section (lazily, so each type only touches the options it needs).
            return configParser.get("general", option)

        def in_leaf_dir(option):
            # Resolve a file-name option from [general] relative to the leaf directory.
            return filePrefix + general(option)

        # Determine the type
        dsType = general("type")
        filePrefix = leafDir + "/"

        # TODO: Replace these if statements with something a bit more robust, such as a proper dependency injection framework
        if dsType == "gaf":
            gaf_fname = in_leaf_dir("gaf_fname")
            gaf_transcript_sequences_fname = in_leaf_dir("gaf_transcript_seqs_fname")
            result = Gaf(gaf_fname,
                         gaf_transcript_sequences_fname,
                         title=general("title"),
                         version=general("version"),
                         protocol=general("protocol"))
        elif dsType == "dbsnp":
            result = dbSNP(in_leaf_dir("src_file"),
                           title=general("title"),
                           version=general("version"))
        elif dsType == "ensembl":
            result = EnsemblTranscriptDatasource(in_leaf_dir("src_file"),
                                                 title=general("title"),
                                                 version=general("version"),
                                                 tx_filter=general("transcript_filter"))
        elif dsType == "cosmic":
            result = Cosmic(src_file=in_leaf_dir("src_file"),
                            version=general("version"),
                            gpp_tabix_file=in_leaf_dir("gpp_src_file"))
        elif dsType == "ref":
            # The window size is optional in the config; 10 is the fallback.
            window_size = general("windowSizeRef") if configParser.has_option("general", "windowSizeRef") else 10
            result = ReferenceDatasource(filePrefix,
                                         title=general("title"),
                                         version=general("version"),
                                         windowSizeRef=window_size)
        elif dsType == "gene_tsv":
            result = GenericGeneDatasource(src_file=in_leaf_dir("src_file"),
                                           title=general("title"),
                                           version=general("version"),
                                           geneColumnName=general("gene_col"))
        elif dsType == "transcript_tsv":
            result = GenericTranscriptDatasource(src_file=in_leaf_dir("src_file"),
                                                 title=general("title"),
                                                 version=general("version"),
                                                 geneColumnName=general("transcript_col"))
        elif dsType == "vc_tsv":
            result = GenericVariantClassificationDatasource(src_file=in_leaf_dir("src_file"),
                                                            title=general("title"),
                                                            version=general("version"),
                                                            geneColumnName=general("vc_col"))
        elif dsType == "gp_tsv":
            result = GenericGenomicPositionDatasource(src_file=in_leaf_dir("src_file"),
                                                      title=general("title"),
                                                      version=general("version"),
                                                      gpColumnNames=general("genomic_position_cols"))
        elif dsType == "gm_tsv":
            result = GenericGenomicMutationDatasource(src_file=in_leaf_dir("src_file"),
                                                      title=general("title"),
                                                      version=general("version"),
                                                      gpColumnNames=general("genomic_position_cols"))
        elif dsType == "gm_tsv_reverse_complement":
            result = GenericGenomicMutationDatasource(src_file=in_leaf_dir("src_file"),
                                                      title=general("title"),
                                                      version=general("version"),
                                                      gpColumnNames=general("genomic_position_cols"),
                                                      use_complementary_strand_alleles_for_negative_strand_transcripts=True)
        elif dsType == "gpp_tsv":
            result = GenericGeneProteinPositionDatasource(src_file=in_leaf_dir("src_file"),
                                                          title=general("title"),
                                                          version=general("version"),
                                                          gpColumnNames=general("gene_protein_position_cols"))
        elif dsType == "transcript_to_uniprot_aa":
            result = TranscriptToUniProtProteinPositionTransformingDatasource(
                title=general("title"),
                version=general("version"),
                src_file="file://" + filePrefix + general("src_file"),  # three slashes for sqlite
                inputPositionAnnotationName=general("inputPositionAnnotationName"),
                outputPositionAnnotationName=general("outputPositionAnnotationName"))
        elif dsType == "mock_exception":
            result = MockExceptionThrowingDatasource(title=general("title"),
                                                     version=general("version"))
        elif dsType == "indexed_vcf":
            result = IndexedVcfDatasource(src_file=in_leaf_dir("src_file"),
                                          title=general("title"),
                                          version=general("version"),
                                          match_mode=general("match_mode"))
        elif dsType == "indexed_tsv":
            # The three column lists are stored as comma-separated strings.
            columnNames = general("column_names").split(",")
            annotationColumnNames = general("annotation_column_names").split(",")
            indexColumnNames = general("index_column_names").split(",")

            DatasourceFactory._log_missing_column_name_msg(columnNames, annotationColumnNames)

            # Look up the declared data type of every non-blank annotation column.
            columnDataTypes = dict(
                (columnName, configParser.get("data_types", columnName))
                for columnName in annotationColumnNames
                if columnName.strip() != ""
            )

            result = IndexedTsvDatasource(src_file=in_leaf_dir("src_file"),
                                          title=general("title"),
                                          version=general("version"),
                                          colNames=columnNames,
                                          annotationColNames=annotationColumnNames,
                                          indexColNames=indexColumnNames,
                                          match_mode=general("match_mode"),
                                          colDataTypes=columnDataTypes)
        elif dsType == "bigwig":
            if not NGSLIB_INSTALLED:
                raise RuntimeError("Bigwig datasource found in db-dir but ngslib library not installed.")
            result = BigWigDatasource(src_file=in_leaf_dir("src_file"),
                                      title=general("title"),
                                      version=general("version"))
        else:
            raise RuntimeError('Unknown datasource type: %s' % dsType)

        result.set_hashcode(DatasourceFactory._retrieve_hash_code(leafDir))
        return result