Ejemplos de Genome.Genome en Python, ejemplos de pyensembl.Genome.Genome en Python

Ejemplo n.º 1

0

Mostrar archivo

def test_ucsc_refseq_genome():
    """
    Build a Genome from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check its genes,
    transcripts and a locus lookup.
    """
    with TemporaryDirectory() as tmpdir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=tmpdir)
        refseq_genome.index()

        # Every gene must carry an ID, and exactly two genes are expected.
        all_genes = refseq_genome.genes()
        for one_gene in all_genes:
            assert one_gene.id, \
                "Gene with missing ID in %s" % (refseq_genome.gtf.dataframe(),)
        gene_count = len(all_genes)
        assert gene_count == 2, \
            "Expected 2 genes, got %d: %s" % (gene_count, all_genes)

        # Same two checks for transcripts.
        all_transcripts = refseq_genome.transcripts()
        for one_transcript in all_transcripts:
            assert one_transcript.id, \
                "Transcript with missing ID in %s" % (
                    refseq_genome.gtf.dataframe(),)
        transcript_count = len(all_transcripts)
        assert transcript_count == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                transcript_count, all_transcripts)

        # Both genes are expected to be found at position 67092176 of contig 1.
        overlapping = refseq_genome.genes_at_locus(1, 67092176)
        overlap_count = len(overlapping)
        assert overlap_count == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                overlap_count, overlapping)
        found_ids = {hit.id for hit in overlapping}
        eq_({"NM_001276352", "NR_075077"}, found_ids)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_missing_genome_sources.py Proyecto: yech1990/pyensembl

def test_transcript_fasta_only():
    """
    A Genome given only a transcript FASTA should load its two sequences,
    and raise ValueError for lookups checked by ``no_gtf_`` / ``no_protein_``.
    """
    fasta_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH
        ])
    fasta_only.index()

    eq_(2, len(fasta_only.transcript_sequences.fasta_dictionary))

    # Each of these calls is expected to fail with ValueError; the raised
    # context is then handed to no_gtf_ for inspection.
    gtf_dependent_calls = (
        lambda: fasta_only.genes(),
        lambda: fasta_only.gene_ids(),
        lambda: fasta_only.gene_ids_of_gene_name("test"),
        lambda: fasta_only.transcript_names(),
    )
    for gtf_dependent_call in gtf_dependent_calls:
        with assert_raises(ValueError) as cm:
            gtf_dependent_call()
        no_gtf_(cm)

    # The protein lookup is checked with no_protein_ instead.
    with assert_raises(ValueError) as cm:
        fasta_only.protein_sequence("test")
    no_protein_(cm)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: dataprep.py Proyecto: mparker2/xpore

def main():
    """
    Run the data preparation steps selected by the command-line arguments.

    Steps, in order:
      (1) unless ``args.skip_eventalign_indexing`` is set, index the
          eventalign output with ``parallel_index``;
      (2) preprocess per gene with ``parallel_preprocess_gene`` when
          ``args.genome`` is set (using either a customised ``Genome`` or an
          ``EnsemblRelease``), otherwise per transcript with
          ``parallel_preprocess_tx``.

    Raises:
        SystemExit: if gene-level preprocessing with a customised genome is
            requested but any of ``reference_name``, ``annotation_name``,
            ``gtf_path_or_url`` or ``transcript_fasta_paths_or_urls`` is
            missing.
    """
    args = get_args()

    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome
    customised_genome = args.customised_genome

    # Always bind the customised-genome settings so later code can never hit
    # an unbound local; whether they are complete is validated just below.
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    custom_settings = (reference_name, annotation_name, gtf_path_or_url,
                       transcript_fasta_paths_or_urls)
    if customised_genome and None in custom_settings:
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        if genome:
            # The settings are required to build the Genome in step (2).
            # Stop now instead of failing only after the indexing step.
            raise SystemExit(1)

    misc.makedirs(out_dir)  # TODO: check every level.

    # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            db = EnsemblRelease(
                ensembl_version, ensembl_species
            )  # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        parallel_preprocess_gene(eventalign_filepath, db, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)

    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: annotate_xcnv.py Proyecto: nkwang24/portfolio

def annotate_xcnv():
    """
    Append start/end annotations to every record of DATA.xcnv and write the
    result to DATA_annotated.xcnv.

    For each data row, the contig is taken from column 4 and the coordinates
    are parsed from column 2 (split on ':' or '-'); the start is shifted by
    +1 and the end falls back to the start when absent. Gene names at both
    positions are looked up in a GRCh37 Genome, then extended with labels of
    exons (positive-strand table first, then negative-strand table) whose
    interval contains the position. The two resulting lists are appended to
    the row, and 'CNV_START' / 'CNV_END' are appended to the header row.
    """
    with open(r'Z:\gatk\data\DATA.xcnv','r') as xcnv_file:
        xcnv_rows = list(csv.reader(xcnv_file, delimiter='\t'))

    genome = Genome(
            reference_name='GRCh37',
            annotation_name='GRCh37',
            gtf_path_or_url=r'Z:\gatk\reference\Homo_sapiens.GRCh37.75.gtf')
    genome.index()

    with open(r'./VPv1_exons_positive','r') as positive_file:
        exons_positive = list(csv.reader(positive_file, delimiter='\t'))
    with open(r'./VPv1_exons_negative_flipped','r') as negative_file:
        exons_negative = list(csv.reader(negative_file, delimiter='\t'))

    def _exon_label(exon_name, bump_number):
        # Turn 'a_b_c_n' into 'a_b c n'; positive-strand exons get n + 1.
        parts = exon_name.split('_')
        number = str(int(parts[3]) + 1) if bump_number else parts[3]
        return parts[0] + '_' + parts[1] + ' ' + parts[2] + ' ' + number

    # (exon table, whether the trailing exon number is incremented)
    strand_tables = ((exons_positive, True), (exons_negative, False))

    for record in xcnv_rows[1:]:
        contig = str(record[4])
        interval = record[2]
        start_pos = int(re.split(':|-', interval)[1]) + 1
        start_hits = genome.gene_names_at_locus(contig=contig, position=start_pos)
        try:
            end_pos = int(re.split(':|-', interval)[2])
        except IndexError:
            # No end coordinate in the interval: reuse the start.
            end_pos = start_pos
        end_hits = genome.gene_names_at_locus(contig=contig, position=end_pos)

        for exon_table, bump_number in strand_tables:
            for exon in exon_table:
                # Exon contigs carry a 3-character prefix that is stripped
                # before comparing.
                if contig != exon[0][3:]:
                    continue
                if int(exon[1]) <= start_pos <= int(exon[2]):
                    start_hits.append(_exon_label(exon[3], bump_number))
                if int(exon[1]) <= end_pos <= int(exon[2]):
                    end_hits.append(_exon_label(exon[3], bump_number))

        record.append(start_hits)
        record.append(end_hits)

    xcnv_rows[0].extend(['CNV_START', 'CNV_END'])

    with open(r'Z:\gatk\data\DATA_annotated.xcnv','w',newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(xcnv_rows)

Ejemplo n.º 5

0

Mostrar archivo

def main():
    """
    Print every line of the BED file named by ``sys.argv[1]`` followed by the
    ``gene`` attribute that ``Region.get_symbol`` fills in from a GRCh38
    Genome built on the local Ensembl 90 GTF.
    """
    bed_path = sys.argv[1]
    gtf_path = "Homo_sapiens.GRCh38.90.gtf"
    genome = Genome(
        reference_name='GRCh38',
        annotation_name='my_genome_features',
        gtf_path_or_url=gtf_path)
    genome.index()

    with open(bed_path) as bed_file:
        for raw_line in bed_file:
            record = raw_line.strip()
            # Only the first three columns are used; the rest are ignored.
            chromosome, start, end, *_ = record.split()
            region = Region(chromosome, start, end)
            region.get_symbol(genome)
            print(record, region.gene)

Ejemplo n.º 6

0

Mostrar archivo

def test_gtf_only():
    """
    A Genome given only a GTF should expose its single gene, and raise
    ValueError for transcript and protein sequence lookups.
    """
    gtf_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH)
    gtf_only.index()

    gene_count = len(gtf_only.genes())
    eq_(1, gene_count)

    # Transcript sequences are checked first, then protein sequences.
    with assert_raises(ValueError) as cm:
        gtf_only.transcript_sequence("test")
    no_transcript_(cm)

    with assert_raises(ValueError) as cm:
        gtf_only.protein_sequence("test")
    no_protein_(cm)

Ejemplo n.º 7

0

Mostrar archivo

def test_gtf_transcript_only():
    """
    A Genome given a GTF and a transcript FASTA should expose its single
    gene and a truthy transcript sequence, and raise ValueError when a
    transcript's protein sequence is accessed.
    """
    transcript_fasta = MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH
    partial_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=transcript_fasta)
    partial_genome.index()

    eq_(1, len(partial_genome.genes()))

    first_transcript = partial_genome.transcripts()[0]
    ok_(first_transcript.sequence)

    # Merely reading the attribute is what should raise.
    with assert_raises(ValueError) as cm:
        first_transcript.protein_sequence
    no_protein_(cm)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_missing_genome_sources.py Proyecto: yech1990/pyensembl

def test_protein_fasta_only():
    """
    A Genome given only a protein FASTA should load its four sequences, and
    raise ValueError for gene listing and transcript sequence lookups.
    """
    protein_fasta_paths = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    proteins_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fasta_paths)
    proteins_only.index()

    loaded_proteins = proteins_only.protein_sequences.fasta_dictionary
    eq_(4, len(loaded_proteins))

    with assert_raises(ValueError) as cm:
        proteins_only.genes()
    no_gtf_(cm)

    with assert_raises(ValueError) as cm:
        proteins_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(cm)

Ejemplo n.º 9

0

Mostrar archivo

def test_ucsc_gencode_genome():
    """
    Build a Genome from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check its genes,
    transcripts, an ID lookup and two locus lookups.
    """
    with TemporaryDirectory() as tmpdir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=tmpdir)
        gencode_genome.index()

        # Every gene must carry an ID, and exactly seven genes are expected.
        all_genes = gencode_genome.genes()
        for one_gene in all_genes:
            assert one_gene.id, \
                "Gene with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),)
        gene_count = len(all_genes)
        assert gene_count == 7, \
            "Expected 7 genes, got %d: %s" % (gene_count, all_genes)

        # Same two checks for transcripts.
        all_transcripts = gencode_genome.transcripts()
        for one_transcript in all_transcripts:
            assert one_transcript.id, \
                "Transcript with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),)
        transcript_count = len(all_transcripts)
        assert transcript_count == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                transcript_count, all_transcripts)

        # Lookup by ID: the gene is expected to have no name and no biotype.
        looked_up = gencode_genome.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        # Locus lookups use the "chr"-prefixed contig name.
        genes_here = gencode_genome.genes_at_locus("chr1", 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = gencode_genome.transcripts_at_locus("chr1", 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")

Ejemplo n.º 10

0

Mostrar archivo

# Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/
# mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via:
# grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

# Tested against:
# http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167

# Test data files for the ENSMUSG00000017167 subset of mouse Ensembl 81,
# each resolved through data_path(): GTF annotation, transcript FASTA and
# protein FASTA.
MOUSE_ENSMUSG00000017167_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf")
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.fa")
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.pep")

# Module-level Genome built from all three subset files above.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH)


def setup_init_custom_mouse_genome():
    """
    Test-setup hook for unit tests that must begin with an empty cache:
    clears the cache of the custom mouse genome, then indexes it again.
    """
    subset_genome = custom_mouse_genome_grcm38_subset
    subset_genome.clear_cache()
    subset_genome.index()

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_mouse.py Proyecto: yusuf1759/varcode

# Ensembl FTP locations of the mouse GRCm38 annotation and sequence files
# for release MOUSE_ENSEMBL_RELEASE. Note that "%" binds tighter than "+",
# so each format is applied to the path template before SERVER is prepended.
SERVER = "ftp://ftp.ensembl.org"
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# The release number must be substituted here as well; without it the URL
# would contain a literal "%d".
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)

# Local VCF fixture loaded by the tests in this module.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Mouse genome described by explicit GTF / transcript FASTA / protein FASTA
# locations (the MOUSE_*_PATH constants) rather than by an Ensembl release.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH],
    protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH])

# The same release number, obtained through EnsemblRelease for species "mouse".
ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse")


def test_load_vcf_mouse_with_explicit_urls():
    """Loading the mouse VCF against ``explicit_url_genome`` yields 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)


def test_load_vcf_mouse_with_ensembl_release():
    """Loading the mouse VCF against ``ensembl_mouse_genome`` yields 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: samfile.py Proyecto: BIOT670-Group1b/BLUE

def load_genome(file):
    """
    Build a GRCh38 ``Genome`` from ``file`` and store it in the module-level
    global ``data``.

    ``file`` is passed through unchanged as ``gtf_path_or_url``. The global
    is bound before ``index()`` is called, and nothing is returned.
    """
    global data
    data = Genome(reference_name='GRCh38',
                  annotation_name='ENSEMBL',
                  gtf_path_or_url=file)
    # Index the freshly created genome in place.
    data.index()