def test_transcript_fasta_only():
    """
    Build a Genome from a transcript FASTA alone and check what it supports.

    The transcript FASTA dictionary must hold 2 entries, while the gene and
    transcript-name queries and the protein sequence lookup are each expected
    to raise ValueError.
    """
    fasta_only_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    fasta_only_genome.index()

    n_transcript_sequences = len(
        fasta_only_genome.transcript_sequences.fasta_dictionary)
    eq_(2, n_transcript_sequences)

    # Wrapped in lambdas so that nothing on the genome is touched until we
    # are inside the assert_raises context. The captured context is then
    # handed to no_gtf_ for inspection.
    gtf_dependent_calls = (
        lambda: fasta_only_genome.genes(),
        lambda: fasta_only_genome.gene_ids(),
        lambda: fasta_only_genome.gene_ids_of_gene_name("test"),
        lambda: fasta_only_genome.transcript_names(),
    )
    for call in gtf_dependent_calls:
        with assert_raises(ValueError) as raised:
            call()
        no_gtf_(raised)

    with assert_raises(ValueError) as raised:
        fasta_only_genome.protein_sequence("test")
    no_protein_(raised)
def test_transcript_fasta_only():
    """
    Check a Genome configured with nothing but a list of transcript FASTAs.

    Expects 2 entries in the transcript FASTA dictionary, and ValueError from
    the gene / transcript-name queries and from the protein sequence lookup.
    """
    transcript_fasta_paths = [MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH]
    transcripts_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=transcript_fasta_paths)
    transcripts_only.index()

    fasta_dict = transcripts_only.transcript_sequences.fasta_dictionary
    eq_(2, len(fasta_dict))

    # Each query below is attempted inside assert_raises; the lambdas defer
    # all access to the genome until that point.
    for query in (
            lambda: transcripts_only.genes(),
            lambda: transcripts_only.gene_ids(),
            lambda: transcripts_only.gene_ids_of_gene_name("test"),
            lambda: transcripts_only.transcript_names()):
        with assert_raises(ValueError) as ctx:
            query()
        no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        transcripts_only.protein_sequence("test")
    no_protein_(ctx)
Ejemplo n.º 3
0
def test_protein_fasta_only():
    """
    Build a Genome from a protein FASTA alone and check what it supports.

    The protein FASTA dictionary must hold 4 entries; asking for genes or for
    a transcript sequence is expected to raise ValueError.
    """
    proteins_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    proteins_only.index()

    n_protein_sequences = len(proteins_only.protein_sequences.fasta_dictionary)
    eq_(4, n_protein_sequences)

    # Gene query with no GTF configured.
    with assert_raises(ValueError) as raised:
        proteins_only.genes()
    no_gtf_(raised)

    # Transcript sequence lookup with no transcript FASTA configured.
    with assert_raises(ValueError) as raised:
        proteins_only.transcript_sequence("test")
    no_transcript_(raised)
Ejemplo n.º 4
0
def test_ucsc_refseq_genome():
    """
    Exercise a Genome built from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables

    Checks that every gene and transcript has an ID, that there are 2 of
    each, and that both genes are found at locus 1:67092176.
    """
    with TemporaryDirectory() as cache_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir,
        )
        refseq_genome.index()

        all_genes = refseq_genome.genes()
        for g in all_genes:
            # The dataframe is only rendered into the message on failure.
            assert g.id, (
                "Gene with missing ID in %s" % (
                    refseq_genome.gtf.dataframe(),))
        n_genes = len(all_genes)
        assert n_genes == 2, (
            "Expected 2 genes, got %d: %s" % (len(all_genes), all_genes))

        all_transcripts = refseq_genome.transcripts()
        for t in all_transcripts:
            assert t.id, (
                "Transcript with missing ID in %s" % (
                    refseq_genome.gtf.dataframe(),))
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 2, (
            "Expected 2 transcripts, got %d: %s" % (
                len(all_transcripts), all_transcripts))

        overlapping = refseq_genome.genes_at_locus(1, 67092176)
        assert len(overlapping) == 2, (
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                len(overlapping), overlapping))

        found_ids = {g.id for g in overlapping}
        eq_({"NM_001276352", "NR_075077"}, found_ids)
def test_protein_fasta_only():
    """
    Check a Genome configured with nothing but a list of protein FASTAs.

    Expects 4 entries in the protein FASTA dictionary, and ValueError from
    the gene query and from a transcript sequence lookup.
    """
    protein_fasta_paths = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    protein_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fasta_paths,
    )
    protein_genome.index()

    protein_dict = protein_genome.protein_sequences.fasta_dictionary
    eq_(4, len(protein_dict))

    # Gene query with no GTF configured.
    with assert_raises(ValueError) as ctx:
        protein_genome.genes()
    no_gtf_(ctx)

    # Transcript sequence lookup with no transcript FASTA configured.
    with assert_raises(ValueError) as ctx:
        protein_genome.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(ctx)
Ejemplo n.º 6
0
def test_ucsc_refseq_genome():
    """
    Genome round-trip against a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables

    Uses a temporary cache directory, then verifies gene/transcript IDs and
    counts (2 each) and the two genes located at 1:67092176.
    """
    missing_gene_id = "Gene with missing ID in %s"
    missing_transcript_id = "Transcript with missing ID in %s"

    with TemporaryDirectory() as scratch_dir:
        ucsc_genome = Genome(reference_name="GRCh38",
                             annotation_name="ucsc_test",
                             gtf_path_or_url=UCSC_REFSEQ_PATH,
                             cache_directory_path=scratch_dir)
        ucsc_genome.index()

        gene_list = ucsc_genome.genes()
        for entry in gene_list:
            # Message formatting (and the dataframe call) happens only when
            # the assertion fails.
            assert entry.id, missing_gene_id % (ucsc_genome.gtf.dataframe(),)
        assert len(gene_list) == 2, \
            "Expected 2 genes, got %d: %s" % (len(gene_list), gene_list)

        transcript_list = ucsc_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, \
                missing_transcript_id % (ucsc_genome.gtf.dataframe(),)
        assert len(transcript_list) == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                len(transcript_list), transcript_list)

        locus_genes = ucsc_genome.genes_at_locus(1, 67092176)
        assert len(locus_genes) == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                len(locus_genes), locus_genes)

        expected_ids = {"NM_001276352", "NR_075077"}
        actual_ids = {entry.id for entry in locus_genes}
        eq_(expected_ids, actual_ids)
Ejemplo n.º 7
0
def test_ucsc_gencode_genome():
    """
    Exercise a Genome built from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables

    Checks IDs and counts (7 genes, 7 transcripts), the attributes of gene
    "uc001aak.4", and the first gene/transcript returned at two loci on
    contig 1.
    """
    with TemporaryDirectory() as cache_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir)
        gencode_genome.index()

        all_genes = gencode_genome.genes()
        for g in all_genes:
            # The dataframe is only rendered into the message on failure.
            assert g.id, \
                "Gene with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),)
        assert len(all_genes) == 7, \
            "Expected 7 genes, got %d: %s" % (
                len(all_genes), all_genes)

        all_transcripts = gencode_genome.transcripts()
        for t in all_transcripts:
            assert t.id, \
                "Transcript with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),)
        assert len(all_transcripts) == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                len(all_transcripts), all_transcripts)

        # This gene is expected to carry an ID but neither name nor biotype.
        looked_up = gencode_genome.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        genes_here = gencode_genome.genes_at_locus(1, 17369)
        first_gene = genes_here[0]
        eq_(first_gene.id, "uc031tla.1")

        transcripts_here = gencode_genome.transcripts_at_locus(1, 30564)
        first_transcript = transcripts_here[0]
        eq_(first_transcript.id, "uc057aty.1")
Ejemplo n.º 8
0
def test_ucsc_refseq():
    """
    Install a Genome from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    and check gene/transcript counts plus the genes found at 1:67092176.
    """
    refseq_genome = Genome("GRCh38", gtf_path_or_url=UCSC_REFSEQ_PATH)
    refseq_genome.install()

    n_genes = len(refseq_genome.genes())
    eq_(n_genes, 2)
    n_transcripts = len(refseq_genome.transcripts())
    eq_(n_transcripts, 2)

    overlapping = refseq_genome.genes_at_locus(1, 67092176)
    eq_(len(overlapping), 2)

    found_ids = {g.id for g in overlapping}
    eq_({"NM_001276352", "NR_075077"}, found_ids)
def test_gtf_only():
    """
    Build a Genome from a GTF alone and check what it supports.

    Exactly 1 gene must be available; transcript and protein sequence lookups
    are each expected to raise ValueError.
    """
    gtf_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    gtf_genome.index()

    gene_count = len(gtf_genome.genes())
    eq_(1, gene_count)

    # Sequence lookup with no transcript FASTA configured.
    with assert_raises(ValueError) as raised:
        gtf_genome.transcript_sequence("test")
    no_transcript_(raised)

    # Sequence lookup with no protein FASTA configured.
    with assert_raises(ValueError) as raised:
        gtf_genome.protein_sequence("test")
    no_protein_(raised)
def test_gtf_transcript_only():
    """
    Build a Genome from a GTF plus a transcript FASTA, with no protein FASTA.

    Exactly 1 gene must be available and the first transcript must have a
    truthy sequence; reading its protein sequence is expected to raise
    ValueError.
    """
    gtf_and_transcripts = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    gtf_and_transcripts.index()

    gene_count = len(gtf_and_transcripts.genes())
    eq_(1, gene_count)

    first_transcript = gtf_and_transcripts.transcripts()[0]
    ok_(first_transcript.sequence)

    with assert_raises(ValueError) as raised:
        # The attribute access itself is what should raise; the value is
        # deliberately discarded.
        _ = first_transcript.protein_sequence
    no_protein_(raised)
Ejemplo n.º 11
0
def test_gtf_transcript_only():
    """
    Check a Genome that has a GTF and a transcript FASTA but no protein FASTA.

    Expects 1 gene, a truthy sequence on the first transcript, and ValueError
    when that transcript's protein sequence is read.
    """
    genome_kwargs = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    no_protein_genome = Genome(**genome_kwargs)
    no_protein_genome.index()

    eq_(1, len(no_protein_genome.genes()))

    all_transcripts = no_protein_genome.transcripts()
    head = all_transcripts[0]
    ok_(head.sequence)

    with assert_raises(ValueError) as ctx:
        # Only the attribute access matters here; the result is thrown away.
        _ = head.protein_sequence
    no_protein_(ctx)
def test_gtf_only():
    """
    Check a Genome configured with nothing but a GTF.

    Expects 1 gene, and ValueError from both the transcript sequence and the
    protein sequence lookups.
    """
    annotation_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH)
    annotation_only.index()

    n_genes = len(annotation_only.genes())
    eq_(1, n_genes)

    # Sequence lookup with no transcript FASTA configured.
    with assert_raises(ValueError) as ctx:
        annotation_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(ctx)

    # NOTE(review): the ID string below looks like a variable-rename
    # artifact rather than a meaningful protein ID; left as-is pending
    # confirmation that any ID would do here.
    with assert_raises(ValueError) as ctx:
        annotation_only.protein_sequence("genome_only_gtf")
    no_protein_(ctx)
Ejemplo n.º 13
0
def test_ucsc_gencode():
    """
    Install a Genome from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    and check counts, one gene's attributes, and two locus lookups.
    """
    gencode_genome = Genome("GRCh38", gtf_path_or_url=UCSC_GENCODE_PATH)
    gencode_genome.install()

    n_genes = len(gencode_genome.genes())
    eq_(n_genes, 7)
    n_transcripts = len(gencode_genome.transcripts())
    eq_(n_transcripts, 7)

    # This gene is expected to carry an ID but neither name nor biotype.
    looked_up = gencode_genome.gene_by_id("uc001aak.4")
    eq_(looked_up.id, "uc001aak.4")
    eq_(looked_up.name, None)
    eq_(looked_up.biotype, None)

    genes_here = gencode_genome.genes_at_locus(1, 17369)
    first_gene = genes_here[0]
    eq_(first_gene.id, "uc031tla.1")

    transcripts_here = gencode_genome.transcripts_at_locus(1, 30564)
    first_transcript = transcripts_here[0]
    eq_(first_transcript.id, "uc057aty.1")
Ejemplo n.º 14
0
def test_ucsc_gencode_genome():
    """
    Genome round-trip against a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables

    Uses a temporary cache directory, then verifies IDs and counts (7 genes,
    7 transcripts), the attributes of gene "uc001aak.4", and locus lookups
    addressed with the "chr1" contig name.
    """
    missing_gene_id = "Gene with missing ID in %s"
    missing_transcript_id = "Transcript with missing ID in %s"

    with TemporaryDirectory() as scratch_dir:
        ucsc_genome = Genome(reference_name="GRCh38",
                             annotation_name="ucsc_test",
                             gtf_path_or_url=UCSC_GENCODE_PATH,
                             cache_directory_path=scratch_dir)
        ucsc_genome.index()

        gene_list = ucsc_genome.genes()
        for entry in gene_list:
            # Message formatting (and the dataframe call) happens only when
            # the assertion fails.
            assert entry.id, missing_gene_id % (ucsc_genome.gtf.dataframe(),)
        assert len(gene_list) == 7, (
            "Expected 7 genes, got %d: %s" % (len(gene_list), gene_list))

        transcript_list = ucsc_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, (
                missing_transcript_id % (ucsc_genome.gtf.dataframe(),))
        assert len(transcript_list) == 7, (
            "Expected 7 transcripts, got %d: %s" % (
                len(transcript_list), transcript_list))

        # This gene is expected to carry an ID but neither name nor biotype.
        target_gene = ucsc_genome.gene_by_id("uc001aak.4")
        eq_(target_gene.id, "uc001aak.4")
        eq_(target_gene.name, None)
        eq_(target_gene.biotype, None)

        locus_genes = ucsc_genome.genes_at_locus("chr1", 17369)
        eq_(locus_genes[0].id, "uc031tla.1")

        locus_transcripts = ucsc_genome.transcripts_at_locus("chr1", 30564)
        eq_(locus_transcripts[0].id, "uc057aty.1")