Ejemplo n.º 1
0
def main():
    """Run the data-preparation pipeline described by the command-line args.

    Steps:
      1. Unless ``args.skip_eventalign_indexing`` is set, call
         ``parallel_index`` on the eventalign and summary files.
      2. Preprocess the reads for modelling, either per gene (when
         ``args.genome`` is set) or per transcript.

    When ``args.customised_genome`` is set, ``reference_name``,
    ``annotation_name``, ``gtf_path_or_url`` and
    ``transcript_fasta_paths_or_urls`` must all be provided. If any of them is
    missing, a hint is printed and the function returns without doing any
    work.
    """
    args = get_args()

    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome

    customised_genome = args.customised_genome
    # Bind these unconditionally so they can never be undefined further down.
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls
    custom_settings = (reference_name, annotation_name, gtf_path_or_url,
                       transcript_fasta_paths_or_urls)
    if customised_genome and None in custom_settings:
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        # Stop here: previously execution continued and crashed with a
        # NameError when the custom Genome was built after the indexing step.
        return

    misc.makedirs(out_dir)  #todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            db = EnsemblRelease(
                ensembl_version, ensembl_species
            )  # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        parallel_preprocess_gene(eventalign_filepath, db, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)

    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
Ejemplo n.º 2
0
def setup_create_genome():
    """Rebuild the module-level ``mouse_genome`` fixture from a clean cache.

    Rebinds the global ``mouse_genome`` to a new ``Genome`` for the
    ENSMUSG00000017167 subset files, clears its cache and indexes it.
    """
    global mouse_genome
    genome_settings = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    mouse_genome = Genome(**genome_settings)
    mouse_genome.clear_cache()
    mouse_genome.index()
Ejemplo n.º 3
0
def main():
    """Print each line of a whitespace-separated region file with its gene.

    The input path is read from ``sys.argv[1]``. The first three fields of
    every line are used as chromosome, start and end of a ``Region``; any
    further fields are ignored. Genes are looked up in a ``Genome`` built
    from the local ``Homo_sapiens.GRCh38.90.gtf`` file.
    """
    bed_path = sys.argv[1]
    annotation = Genome(reference_name='GRCh38',
                        annotation_name='my_genome_features',
                        gtf_path_or_url="Homo_sapiens.GRCh38.90.gtf")
    annotation.index()
    with open(bed_path) as handle:
        for raw_line in handle:
            record = raw_line.strip()
            chromosome, start, end, *_ = record.split()
            region = Region(chromosome, start, end)
            region.get_symbol(annotation)
            print(record, region.gene)
Ejemplo n.º 4
0
def test_ucsc_gencode_genome():
    """
    Index a small GENCODE GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) inside a temporary cache
    directory and check gene/transcript counts, lookup by ID and locus
    queries.
    """
    with TemporaryDirectory() as cache_dir:
        gencode = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir,
        )
        gencode.index()

        # Every gene must carry an ID and there must be exactly seven of them.
        all_genes = gencode.genes()
        for g in all_genes:
            assert g.id, "Gene with missing ID in %s" % (gencode.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 7, "Expected 7 genes, got %d: %s" % (n_genes, all_genes)

        # Same checks for transcripts.
        all_transcripts = gencode.transcripts()
        for t in all_transcripts:
            assert t.id, "Transcript with missing ID in %s" % (gencode.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 7, "Expected 7 transcripts, got %d: %s" % (n_transcripts, all_transcripts)

        looked_up = gencode.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        genes_here = gencode.genes_at_locus(1, 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = gencode.transcripts_at_locus(1, 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")
Ejemplo n.º 5
0
def test_mouse_ENSMUSG00000017167():
    """
    Check gene, transcript and sequence lookups for mouse gene Cntnap1.

    Fixture provenance:

    * GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/
      Mus_musculus.GRCm38.81.gtf.gz via:
      grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf
    * Transcript FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
      fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz via:
      grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50
    * Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/
      mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via:
      grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

    Tested against:
    http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
    """
    mouse = Genome(
        reference_name="GRCm38",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    mouse.install()

    # Exactly one gene is named Cntnap1 and it has two transcripts.
    matching_genes = mouse.genes_by_name("Cntnap1")
    eq_(len(matching_genes), 1)
    cntnap1 = matching_genes[0]
    cntnap1_transcripts = cntnap1.transcripts
    eq_(len(cntnap1_transcripts), 2)

    # Only one of the two transcripts is protein coding.
    coding = [t for t in cntnap1_transcripts if t.biotype == "protein_coding"]
    eq_(len(coding), 1)
    coding_transcript = coding[0]

    expected_cdna_prefix = (
        "GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
        "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT"
    )
    eq_(coding_transcript.sequence[:120], expected_cdna_prefix)

    expected_protein_prefix = (
        "MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS"
        "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH"
    )
    eq_(coding_transcript.protein_sequence[:120], expected_protein_prefix)
def test_gtf_transcript_only():
    """A genome with a GTF and transcript FASTA but no protein FASTA.

    Genes and transcript sequences are available; reading a protein sequence
    must raise ``ValueError``.
    """
    settings = {
        "reference_name": "GRCm38",
        "annotation_name": "_test_mouse_ensembl81_subset",
        "gtf_path_or_url": MOUSE_ENSMUSG00000017167_PATH,
        "transcript_fasta_path_or_url": MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    }
    partial_genome = Genome(**settings)
    partial_genome.index()

    eq_(1, len(partial_genome.genes()))

    first_transcript = partial_genome.transcripts()[0]
    ok_(first_transcript.sequence)

    # The attribute access itself is what is expected to raise.
    with assert_raises(ValueError) as raised:
        first_transcript.protein_sequence
    no_protein_(raised)
Ejemplo n.º 7
0
def test_gtf_only():
    """A genome built from a GTF alone has genes but no sequences.

    Both transcript and protein sequence lookups must raise ``ValueError``.
    """
    gtf_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    gtf_genome.index()

    gene_count = len(gtf_genome.genes())
    eq_(1, gene_count)

    with assert_raises(ValueError) as raised:
        gtf_genome.transcript_sequence("test")
    no_transcript_(raised)

    with assert_raises(ValueError) as raised:
        gtf_genome.protein_sequence("test")
    no_protein_(raised)
Ejemplo n.º 8
0
def test_ucsc_refseq_genome():
    """
    Index a small RefSeq GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) in a temporary cache directory
    and check its gene/transcript counts and a locus query.
    """
    with TemporaryDirectory() as cache_dir:
        refseq = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir,
        )
        refseq.index()

        all_genes = refseq.genes()
        for g in all_genes:
            assert g.id, "Gene with missing ID in %s" % (refseq.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 2, "Expected 2 genes, got %d: %s" % (n_genes, all_genes)

        all_transcripts = refseq.transcripts()
        for t in all_transcripts:
            assert t.id, "Transcript with missing ID in %s" % (refseq.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 2, "Expected 2 transcripts, got %d: %s" % (n_transcripts, all_transcripts)

        # Both genes overlap this position on chromosome 1.
        overlapping = refseq.genes_at_locus(1, 67092176)
        n_overlapping = len(overlapping)
        assert n_overlapping == 2, "Expected 2 genes at locus chr1:67092176, got %d: %s" % (n_overlapping, overlapping)
        found_ids = {g.id for g in overlapping}
        eq_({"NM_001276352", "NR_075077"}, found_ids)
Ejemplo n.º 9
0
def get_gene_ids_of_transcript_id(transcript_id: str,
                                  ensembl: pyensembl.Genome,
                                  raise_on_error: bool = False):
    """ Extract all gene ids associated with the given transcript.

    The transcript is first mapped to its gene name, and then every gene id
    registered under that name is collected.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier

    ensembl: pyensembl.Genome
        The annotations

    raise_on_error: bool
        Whether to raise an exception (True) or log a warning and return
        None (False) if either lookup raises a ValueError

    Returns
    -------
    transcript_gene_ids: list of dicts
        One record per gene id, suitable for building a data frame. Each
        record has the keys:

            transcript_id
            gene_id

    --- OR ---

    None, if a lookup failed and raise_on_error is False

    Raises
    ------
    ValueError
        If a lookup failed and raise_on_error is True; the original error is
        chained as the cause
    """
    try:
        gene_name = ensembl.gene_name_of_transcript_id(transcript_id)
        gene_ids = ensembl.gene_ids_of_gene_name(gene_name)
    except ValueError as ve:
        # The prefix previously contained a stray quote ("['pyensembl_utils");
        # it now matches the "[module.function]:" form used by
        # get_gene_name_of_transcript_id.
        msg = ("[pyensembl_utils.get_gene_ids_of_transcript_id]: could not "
               "find transcript id in database: {}".format(transcript_id))
        if raise_on_error:
            raise ValueError(msg) from ve
        logger.warning(msg)
        return None

    return [{'transcript_id': transcript_id, 'gene_id': g} for g in gene_ids]
Ejemplo n.º 10
0
def test_ucsc_refseq_genome():
    """
    Build a Genome from a small RefSeq GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables), using a temporary cache
    directory, and verify gene/transcript counts plus a locus lookup.
    """
    with TemporaryDirectory() as scratch:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=scratch,
        )
        refseq_genome.index()

        gene_list = refseq_genome.genes()
        for entry in gene_list:
            assert entry.id, "Gene with missing ID in %s" % (refseq_genome.gtf.dataframe(),)
        gene_total = len(gene_list)
        assert gene_total == 2, "Expected 2 genes, got %d: %s" % (gene_total, gene_list)

        transcript_list = refseq_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, "Transcript with missing ID in %s" % (refseq_genome.gtf.dataframe(),)
        transcript_total = len(transcript_list)
        assert transcript_total == 2, "Expected 2 transcripts, got %d: %s" % (transcript_total, transcript_list)

        locus_genes = refseq_genome.genes_at_locus(1, 67092176)
        locus_total = len(locus_genes)
        assert locus_total == 2, "Expected 2 genes at locus chr1:67092176, got %d: %s" % (locus_total, locus_genes)
        locus_ids = {entry.id for entry in locus_genes}
        eq_({"NM_001276352", "NR_075077"}, locus_ids)
def test_gtf_only():
    """With only a GTF configured, sequence lookups raise ``ValueError``.

    The single gene from the GTF is still reported by ``genes()``.
    """
    settings = {
        "reference_name": "GRCm38",
        "annotation_name": "_test_mouse_ensembl81_subset",
        "gtf_path_or_url": MOUSE_ENSMUSG00000017167_PATH,
    }
    annotation_only = Genome(**settings)
    annotation_only.index()

    eq_(1, len(annotation_only.genes()))

    with assert_raises(ValueError) as ctx:
        annotation_only.transcript_sequence("test")
    no_transcript_(ctx)

    with assert_raises(ValueError) as ctx:
        annotation_only.protein_sequence("test")
    no_protein_(ctx)
Ejemplo n.º 12
0
def test_protein_fasta_only():
    """A genome built from a protein FASTA alone.

    The four protein sequences are loaded, while gene queries and transcript
    sequence lookups must raise ``ValueError``.
    """
    protein_fastas = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    proteins_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fastas,
    )
    proteins_only.index()

    loaded = proteins_only.protein_sequences.fasta_dictionary
    eq_(4, len(loaded))

    with assert_raises(ValueError) as raised:
        proteins_only.genes()
    no_gtf_(raised)

    with assert_raises(ValueError) as raised:
        proteins_only.transcript_sequence("test")
    no_transcript_(raised)
Ejemplo n.º 13
0
def annotate_xcnv():
    """Annotate each CNV row of DATA.xcnv with gene and exon labels.

    Reads the tab-separated DATA.xcnv table, and for every data row looks up
    the gene names at the interval start (+1) and at the interval end in a
    GRCh37 ``Genome``. Labels of matching entries from the positive-strand
    and negative-strand exon tables are appended to those lists. The two
    lists are added to the row as ``CNV_START`` and ``CNV_END`` columns and
    the table is written to DATA_annotated.xcnv.
    """
    with open(r'Z:\gatk\data\DATA.xcnv', 'r') as xcnv_file:
        xcnv_rows = list(csv.reader(xcnv_file, delimiter='\t'))

    annotation = Genome(
        reference_name='GRCh37',
        annotation_name='GRCh37',
        gtf_path_or_url=r'Z:\gatk\reference\Homo_sapiens.GRCh37.75.gtf')
    annotation.index()

    with open(r'./VPv1_exons_positive', 'r') as positive_file:
        exons_positive = list(csv.reader(positive_file, delimiter='\t'))
    with open(r'./VPv1_exons_negative_flipped', 'r') as negative_file:
        exons_negative = list(csv.reader(negative_file, delimiter='\t'))

    # Skip the header row; each remaining row is extended in place.
    for cnv in xcnv_rows[1:]:
        contig = str(cnv[4])
        interval = re.split(':|-', cnv[2])
        start_pos = int(interval[1]) + 1
        start_hits = annotation.gene_names_at_locus(contig=contig, position=start_pos)
        try:
            end_pos = int(interval[2])
        except IndexError:
            # No end coordinate in the interval: fall back to the start.
            end_pos = start_pos
        end_hits = annotation.gene_names_at_locus(contig=contig, position=end_pos)

        # Positive-strand exons: the last label component is shifted by one.
        for exon in exons_positive:
            if contig != exon[0][3:]:
                continue
            if int(exon[1]) <= start_pos <= int(exon[2]):
                parts = exon[3].split('_')
                start_hits.append(' '.join(
                    [parts[0] + '_' + parts[1], parts[2], str(int(parts[3]) + 1)]))
            if int(exon[1]) <= end_pos <= int(exon[2]):
                parts = exon[3].split('_')
                end_hits.append(' '.join(
                    [parts[0] + '_' + parts[1], parts[2], str(int(parts[3]) + 1)]))

        # Negative-strand exons: the last label component is used as-is.
        for exon in exons_negative:
            if contig != exon[0][3:]:
                continue
            if int(exon[1]) <= start_pos <= int(exon[2]):
                parts = exon[3].split('_')
                start_hits.append(' '.join(
                    [parts[0] + '_' + parts[1], parts[2], parts[3]]))
            if int(exon[1]) <= end_pos <= int(exon[2]):
                parts = exon[3].split('_')
                end_hits.append(' '.join(
                    [parts[0] + '_' + parts[1], parts[2], parts[3]]))

        cnv.append(start_hits)
        cnv.append(end_hits)

    xcnv_rows[0].extend(['CNV_START', 'CNV_END'])

    with open(r'Z:\gatk\data\DATA_annotated.xcnv', 'w', newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(xcnv_rows)
def test_protein_fasta_only():
    """Only protein sequences are available when just a protein FASTA is given.

    Gene queries and transcript sequence lookups must raise ``ValueError``.
    """
    settings = {
        "reference_name": "GRCm38",
        "annotation_name": "_test_mouse_ensembl81_subset",
        "protein_fasta_paths_or_urls": [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH],
    }
    protein_genome = Genome(**settings)
    protein_genome.index()

    sequence_table = protein_genome.protein_sequences.fasta_dictionary
    eq_(4, len(sequence_table))

    with assert_raises(ValueError) as ctx:
        protein_genome.genes()
    no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        protein_genome.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(ctx)
Ejemplo n.º 15
0
def test_ucsc_gencode_genome():
    """
    Build a Genome from a small GENCODE GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) with a temporary cache
    directory; verify counts, lookup by ID and "chr1" locus queries.
    """
    with TemporaryDirectory() as scratch:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=scratch,
        )
        gencode_genome.index()

        gene_list = gencode_genome.genes()
        for entry in gene_list:
            assert entry.id, "Gene with missing ID in %s" % (gencode_genome.gtf.dataframe(),)
        gene_total = len(gene_list)
        assert gene_total == 7, "Expected 7 genes, got %d: %s" % (gene_total, gene_list)

        transcript_list = gencode_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, "Transcript with missing ID in %s" % (gencode_genome.gtf.dataframe(),)
        transcript_total = len(transcript_list)
        assert transcript_total == 7, "Expected 7 transcripts, got %d: %s" % (transcript_total, transcript_list)

        by_id = gencode_genome.gene_by_id("uc001aak.4")
        eq_(by_id.id, "uc001aak.4")
        eq_(by_id.name, None)
        eq_(by_id.biotype, None)

        # Contigs are addressed with the UCSC-style "chr" prefix here.
        locus_genes = gencode_genome.genes_at_locus("chr1", 17369)
        eq_(locus_genes[0].id, "uc031tla.1")

        locus_transcripts = gencode_genome.transcripts_at_locus("chr1", 30564)
        eq_(locus_transcripts[0].id, "uc057aty.1")
Ejemplo n.º 16
0
def test_ucsc_gencode():
    """
    Install a Genome backed by a small GENCODE GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) and check counts, lookup by ID
    and locus queries.
    """
    gencode = Genome("GRCh38", gtf_path_or_url=UCSC_GENCODE_PATH)
    gencode.install()

    gene_total = len(gencode.genes())
    eq_(gene_total, 7)
    transcript_total = len(gencode.transcripts())
    eq_(transcript_total, 7)

    by_id = gencode.gene_by_id("uc001aak.4")
    eq_(by_id.id, "uc001aak.4")
    eq_(by_id.name, None)
    eq_(by_id.biotype, None)

    first_gene_here = gencode.genes_at_locus(1, 17369)[0]
    eq_(first_gene_here.id, "uc031tla.1")

    first_transcript_here = gencode.transcripts_at_locus(1, 30564)[0]
    eq_(first_transcript_here.id, "uc057aty.1")
Ejemplo n.º 17
0
def get_gene_name_of_transcript_id(transcript_id: str,
                                   ensembl: pyensembl.Genome,
                                   raise_on_error: bool = False):
    """ Look up the gene name (symbol) that belongs to a transcript id.

    This wraps ensembl.gene_name_of_transcript_id so that an unknown
    transcript id can be reported as a logged warning instead of an
    exception.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier (e.g., "ENSMUST00000035194")

    ensembl: pyensembl.Genome
        The annotation database

    raise_on_error: bool
        If True, a failed lookup raises a ValueError; if False, a warning is
        logged and None is returned

    Returns
    -------
    gene_name: string
        The gene name (also called gene symbol, e.g., "Mapkapk3")

    --- OR ---

    None, if the lookup failed and raise_on_error is False
    """
    try:
        return ensembl.gene_name_of_transcript_id(transcript_id)
    except ValueError as ve:
        template = ("[pyensembl_utils.get_gene_name_of_transcript_id]: could not "
                    "find match for transcript id: {}")
        msg = template.format(transcript_id)

        if raise_on_error:
            raise ValueError(msg) from ve
        logger.warning(msg)

    return None
Ejemplo n.º 18
0
def test_ucsc_refseq():
    """
    Install a Genome backed by a small RefSeq GTF file (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) and check its counts and the
    genes found at one locus.
    """
    refseq = Genome("GRCh38", gtf_path_or_url=UCSC_REFSEQ_PATH)
    refseq.install()

    gene_total = len(refseq.genes())
    eq_(gene_total, 2)
    transcript_total = len(refseq.transcripts())
    eq_(transcript_total, 2)

    overlapping = refseq.genes_at_locus(1, 67092176)
    eq_(len(overlapping), 2)
    found_ids = {g.id for g in overlapping}
    eq_({"NM_001276352", "NR_075077"}, found_ids)
Ejemplo n.º 19
0
def test_gtf_transcript_only():
    """GTF plus transcript FASTA: transcript sequences work, proteins do not.

    Accessing ``protein_sequence`` on a transcript must raise ``ValueError``.
    """
    no_protein_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    no_protein_genome.index()

    gene_total = len(no_protein_genome.genes())
    eq_(1, gene_total)

    first = no_protein_genome.transcripts()[0]
    ok_(first.sequence)

    # Reading the attribute is the operation under test.
    with assert_raises(ValueError) as ctx:
        first.protein_sequence
    no_protein_(ctx)
Ejemplo n.º 20
0
# Remote Ensembl locations of the mouse annotation and sequence files. Every
# "%d" placeholder is filled with MOUSE_ENSEMBL_RELEASE.
SERVER = "ftp://ftp.ensembl.org"
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# The release number was previously never substituted here, leaving a literal
# "%d" in the URL.
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)

# Local VCF fixture loaded by the tests that use these genomes.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Genome described purely through explicit URLs.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH],
    protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH])

# The same release obtained through the EnsemblRelease shortcut.
ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse")


def test_load_vcf_mouse_with_explicit_urls():
    """Loading the mouse VCF against the explicit-URL genome gives 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)


def test_load_vcf_mouse_with_ensembl_release():
    """Loading the mouse VCF against the EnsemblRelease genome gives 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)
Ejemplo n.º 21
0
# Provenance of the protein FASTA fixture: cropped from
# ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/
# Mus_musculus.GRCm38.pep.all.fa.gz via:
# grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

# Reference used to check the expected values:
# http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167

# Local fixture files, all restricted to gene ENSMUSG00000017167.
MOUSE_ENSMUSG00000017167_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.gtf")
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.fa")
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.pep")

# Custom genome assembled from the three fixture files above.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
)


def setup_init_custom_mouse_genome():
    """
    Test-setup hook for unit tests that must start from a cleared cache:
    wipes the cache of the custom mouse genome and indexes it again.
    """
    fixture = custom_mouse_genome_grcm38_subset
    fixture.clear_cache()
    fixture.index()
Ejemplo n.º 22
0
def test_custom_genome_to_dict():
    """A custom genome survives a ``to_dict`` / ``from_dict`` round trip."""
    as_dict = custom_mouse_genome_grcm38_subset.to_dict()
    rebuilt = Genome.from_dict(as_dict)
    eq_(custom_mouse_genome_grcm38_subset, rebuilt)
Ejemplo n.º 23
0
def load_genome(file):
    """Build and index a GRCh38 genome from a GTF file.

    The resulting ``Genome`` is stored in the module-level global ``data``.

    Parameters
    ----------
    file:
        Passed to ``Genome`` as ``gtf_path_or_url``.
    """
    global data
    genome_settings = {
        'reference_name': 'GRCh38',
        'annotation_name': 'ENSEMBL',
        'gtf_path_or_url': file,
    }
    data = Genome(**genome_settings)
    data.index()
Ejemplo n.º 24
0
def test_custom_genome_to_dict():
    """Rebuilding the custom genome from its dict form yields an equal object."""
    original = custom_mouse_genome_grcm38_subset
    snapshot = original.to_dict()
    eq_(original, Genome.from_dict(snapshot))
Ejemplo n.º 25
0
def test_custom_genome_to_json():
    """A custom genome survives a ``to_json`` / ``from_json`` round trip."""
    serialized = custom_mouse_genome_grcm38_subset.to_json()
    rebuilt = Genome.from_json(serialized)
    eq_(custom_mouse_genome_grcm38_subset, rebuilt)
Ejemplo n.º 26
0
def test_custom_genome_to_json():
    """Rebuilding the custom genome from its JSON form yields an equal object."""
    original = custom_mouse_genome_grcm38_subset
    payload = original.to_json()
    eq_(original, Genome.from_json(payload))
Ejemplo n.º 27
0
def test_transcript_fasta_only():
    """A genome built from a transcript FASTA alone.

    The two transcript sequences are loaded; every GTF-backed query and the
    protein sequence lookup must raise ``ValueError``.
    """
    fasta_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    fasta_genome.index()

    loaded = fasta_genome.transcript_sequences.fasta_dictionary
    eq_(2, len(loaded))

    # Queries that need the (missing) GTF.
    with assert_raises(ValueError) as raised:
        fasta_genome.genes()
    no_gtf_(raised)

    with assert_raises(ValueError) as raised:
        fasta_genome.gene_ids()
    no_gtf_(raised)

    with assert_raises(ValueError) as raised:
        fasta_genome.gene_ids_of_gene_name("test")
    no_gtf_(raised)

    with assert_raises(ValueError) as raised:
        fasta_genome.transcript_names()
    no_gtf_(raised)

    # Query that needs the (missing) protein FASTA.
    with assert_raises(ValueError) as raised:
        fasta_genome.protein_sequence("test")
    no_protein_(raised)
def test_transcript_fasta_only():
    """Only transcript sequences are available without a GTF or protein FASTA.

    GTF-backed queries and the protein sequence lookup must each raise
    ``ValueError``.
    """
    settings = {
        "reference_name": "GRCm38",
        "annotation_name": "_test_mouse_ensembl81_subset",
        "transcript_fasta_path_or_url": MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    }
    transcripts_only = Genome(**settings)
    transcripts_only.index()

    sequence_table = transcripts_only.transcript_sequences.fasta_dictionary
    eq_(2, len(sequence_table))

    with assert_raises(ValueError) as ctx:
        transcripts_only.genes()
    no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        transcripts_only.gene_ids()
    no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        transcripts_only.gene_ids_of_gene_name("test")
    no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        transcripts_only.transcript_names()
    no_gtf_(ctx)

    with assert_raises(ValueError) as ctx:
        transcripts_only.protein_sequence("test")
    no_protein_(ctx)