Ejemplo n.º 1
0
def test_ucsc_refseq_genome():
    """
    Build a Genome from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check its genes and
    transcripts.

    The annotation cache is written to a temporary directory so nothing is
    left behind after the test.
    """
    with TemporaryDirectory() as cache_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir,
        )
        refseq_genome.index()

        # Every gene needs an ID, and exactly two genes are expected.
        all_genes = refseq_genome.genes()
        for g in all_genes:
            assert g.id, "Gene with missing ID in %s" % (
                refseq_genome.gtf.dataframe(),)
        assert len(all_genes) == 2, "Expected 2 genes, got %d: %s" % (
            len(all_genes), all_genes)

        # Same checks for transcripts.
        all_transcripts = refseq_genome.transcripts()
        for t in all_transcripts:
            assert t.id, "Transcript with missing ID in %s" % (
                refseq_genome.gtf.dataframe(),)
        assert len(all_transcripts) == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                len(all_transcripts), all_transcripts)

        # Both genes are expected to overlap this position on contig 1.
        locus_genes = refseq_genome.genes_at_locus(1, 67092176)
        assert len(locus_genes) == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                len(locus_genes), locus_genes)
        found_ids = {g.id for g in locus_genes}
        eq_({"NM_001276352", "NR_075077"}, found_ids)
Ejemplo n.º 2
0
def test_ucsc_gencode_genome():
    """
    Build a Genome from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check gene/transcript counts,
    lookup by ID and lookup by locus.
    """
    with TemporaryDirectory() as cache_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir)
        gencode_genome.index()

        # Seven genes are expected, each with a non-empty ID.
        all_genes = gencode_genome.genes()
        for g in all_genes:
            assert g.id, \
                "Gene with missing ID in %s" % (gencode_genome.gtf.dataframe(),)
        assert len(all_genes) == 7, \
            "Expected 7 genes, got %d: %s" % (len(all_genes), all_genes)

        # Seven transcripts are expected, each with a non-empty ID.
        all_transcripts = gencode_genome.transcripts()
        for t in all_transcripts:
            assert t.id, \
                "Transcript with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),)
        assert len(all_transcripts) == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                len(all_transcripts), all_transcripts)

        # This gene is expected to have an ID but no name or biotype.
        looked_up_gene = gencode_genome.gene_by_id("uc001aak.4")
        eq_(looked_up_gene.id, "uc001aak.4")
        eq_(looked_up_gene.name, None)
        eq_(looked_up_gene.biotype, None)

        # Locus queries, using an integer contig name.
        first_gene_at_locus = gencode_genome.genes_at_locus(1, 17369)[0]
        eq_(first_gene_at_locus.id, "uc031tla.1")

        first_transcript_at_locus = gencode_genome.transcripts_at_locus(
            1, 30564)[0]
        eq_(first_transcript_at_locus.id, "uc057aty.1")
Ejemplo n.º 3
0
def main():
    """
    Command-line entry point for the data-preparation pipeline.

    Options come from ``get_args()``. The pipeline has two steps:

    1. Unless ``skip_eventalign_indexing`` is set, index the eventalign file
       with ``parallel_index``.
    2. Preprocess the reads: per gene with ``parallel_preprocess_gene`` when
       ``genome`` is set (using either a customised ``Genome`` or an
       ``EnsemblRelease``), otherwise per transcript with
       ``parallel_preprocess_tx``.

    Raises:
        SystemExit: if gene-level preprocessing with a customised genome is
            requested but one of ``reference_name``, ``annotation_name``,
            ``gtf_path_or_url`` or ``transcript_fasta_paths_or_urls`` was not
            provided.
    """
    args = get_args()
    #
    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome

    customised_genome = args.customised_genome
    # Always bind these: they are plain attribute reads, and leaving them
    # unbound on the error path used to cause a NameError in step (2).
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    required_for_customised = [
        reference_name, annotation_name, gtf_path_or_url,
        transcript_fasta_paths_or_urls
    ]
    if customised_genome and any(
            value is None for value in required_for_customised):
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        if genome:
            # The customised Genome cannot be built without these values, so
            # stop before the indexing step rather than crash after it.
            raise SystemExit(1)

    misc.makedirs(out_dir)  #todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            db = EnsemblRelease(
                ensembl_version, ensembl_species
            )  # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        parallel_preprocess_gene(eventalign_filepath, db, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)

    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
def test_transcript_fasta_only():
    """
    Exercise a Genome that was given a transcript FASTA file and nothing else.

    Two transcript sequences are expected to load. Every query issued here
    that is followed by ``no_gtf_`` or ``no_protein_`` must raise ValueError;
    those helpers (defined elsewhere) inspect the captured error.
    """
    fasta_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH)
    fasta_only.index()

    eq_(2, len(fasta_only.transcript_sequences.fasta_dictionary))

    # Queries checked with no_gtf_; wrapped in lambdas so each call happens
    # inside the assert_raises block.
    gtf_queries = [
        lambda: fasta_only.genes(),
        lambda: fasta_only.gene_ids(),
        lambda: fasta_only.gene_ids_of_gene_name("test"),
        lambda: fasta_only.transcript_names(),
    ]
    for run_query in gtf_queries:
        with assert_raises(ValueError) as cm:
            run_query()
        no_gtf_(cm)

    # Protein lookups are checked with a different helper.
    with assert_raises(ValueError) as cm:
        fasta_only.protein_sequence("test")
    no_protein_(cm)
def test_transcript_fasta_only():
    """
    Exercise a Genome constructed from a list of transcript FASTA paths only.

    Two transcript sequences are expected to load. The GTF-style queries and
    the protein lookup below must each raise ValueError, which is then checked
    by ``no_gtf_`` / ``no_protein_`` (helpers defined elsewhere).
    """
    transcript_fastas = [MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH]
    fasta_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=transcript_fastas)
    fasta_genome.index()

    n_sequences = len(fasta_genome.transcript_sequences.fasta_dictionary)
    eq_(2, n_sequences)

    # (method name, positional arguments) for each query checked by no_gtf_.
    gtf_queries = (
        ("genes", ()),
        ("gene_ids", ()),
        ("gene_ids_of_gene_name", ("test",)),
        ("transcript_names", ()),
    )
    for method_name, call_args in gtf_queries:
        with assert_raises(ValueError) as cm:
            getattr(fasta_genome, method_name)(*call_args)
        no_gtf_(cm)

    with assert_raises(ValueError) as cm:
        fasta_genome.protein_sequence("test")
    no_protein_(cm)
Ejemplo n.º 6
0
def test_ucsc_refseq_genome():
    """
    Verify gene and transcript loading from a small RefSeq GTF file
    downloaded from http://genome.ucsc.edu/cgi-bin/hgTables.
    """
    with TemporaryDirectory() as scratch_dir:
        ucsc_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=scratch_dir)
        ucsc_genome.index()

        # Two genes expected, none with an empty ID.
        gene_list = ucsc_genome.genes()
        assert all(entry.id for entry in gene_list), \
            "Gene with missing ID in %s" % (ucsc_genome.gtf.dataframe(),)
        assert len(gene_list) == 2, \
            "Expected 2 genes, got %d: %s" % (len(gene_list), gene_list)

        # Two transcripts expected, none with an empty ID.
        transcript_list = ucsc_genome.transcripts()
        assert all(entry.id for entry in transcript_list), \
            "Transcript with missing ID in %s" % (
                ucsc_genome.gtf.dataframe(),)
        assert len(transcript_list) == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                len(transcript_list), transcript_list)

        # Both genes are expected at this position on contig 1.
        overlapping = ucsc_genome.genes_at_locus(1, 67092176)
        assert len(overlapping) == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                len(overlapping), overlapping)
        expected_ids = {"NM_001276352", "NR_075077"}
        eq_(expected_ids, {entry.id for entry in overlapping})
Ejemplo n.º 7
0
def setup_create_genome():
    """
    Create the module-level ``mouse_genome`` fixture.

    The Genome is built from the mouse test GTF, transcript FASTA and protein
    FASTA files, its cache is cleared, and it is then indexed again.
    """
    global mouse_genome
    source_files = dict(
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    mouse_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        **source_files)
    # Clear the cache before indexing.
    mouse_genome.clear_cache()
    mouse_genome.index()
Ejemplo n.º 8
0
def annotate_xcnv():
    """
    Add CNV_START and CNV_END columns to DATA.xcnv and save the result.

    For each data row, the interval in column 3 is split on ':' and '-'. The
    start position is the second piece plus one; the end position is the
    third piece, or the start position when there is no third piece. Each new
    column holds the list returned by ``gene_names_at_locus`` for that
    position, extended with a label for every exon (from the positive and
    negative-strand exon tables) on the same contig whose range contains the
    position.

    The annotated table is written to DATA_annotated.xcnv. Returns None.
    """
    with open(r'Z:\gatk\data\DATA.xcnv','r') as xcnv_file:
        xcnv_rows = list(csv.reader(xcnv_file, delimiter='\t'))

    genome = Genome(
        reference_name='GRCh37',
        annotation_name='GRCh37',
        gtf_path_or_url=r'Z:\gatk\reference\Homo_sapiens.GRCh37.75.gtf')
    genome.index()

    with open(r'./VPv1_exons_positive','r') as positive_file:
        positive_exons = list(csv.reader(positive_file, delimiter='\t'))
    with open(r'./VPv1_exons_negative_flipped','r') as negative_file:
        negative_exons = list(csv.reader(negative_file, delimiter='\t'))

    def exon_label(exon, bump_number):
        # Column 4 is split on '_' into four pieces; the last piece is
        # incremented by one for positive-strand exons only.
        pieces = exon[3].split('_')
        number = str(int(pieces[3]) + 1) if bump_number else pieces[3]
        return pieces[0] + '_' + pieces[1] + ' ' + pieces[2] + ' ' + number

    # (exon table, whether to increment the exon number in the label)
    exon_tables = ((positive_exons, True), (negative_exons, False))

    # The first row is the header, so annotation starts at the second row.
    for record in xcnv_rows[1:]:
        contig = str(record[4])
        interval = re.split(':|-', record[2])
        start_pos = int(interval[1]) + 1
        start_hits = genome.gene_names_at_locus(contig=contig, position=start_pos)
        try:
            end_pos = int(interval[2])
        except IndexError:
            # No end coordinate in the interval: reuse the start position.
            end_pos = start_pos
        end_hits = genome.gene_names_at_locus(contig=contig, position=end_pos)

        for exons, bump_number in exon_tables:
            for exon in exons:
                # The first three characters of the exon contig are dropped
                # before comparing (presumably a "chr" prefix).
                if contig != exon[0][3:]:
                    continue
                if int(exon[1]) <= start_pos <= int(exon[2]):
                    start_hits.append(exon_label(exon, bump_number))
                if int(exon[1]) <= end_pos <= int(exon[2]):
                    end_hits.append(exon_label(exon, bump_number))

        record.extend([start_hits, end_hits])

    xcnv_rows[0].extend(['CNV_START', 'CNV_END'])

    with open(r'Z:\gatk\data\DATA_annotated.xcnv','w',newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(xcnv_rows)
Ejemplo n.º 9
0
def main():
    """
    Print every region of a BED file together with its gene.

    The BED path is taken from ``sys.argv[1]``. Gene annotations are loaded
    from ``Homo_sapiens.GRCh38.90.gtf`` in the current directory. For each
    non-empty line, the first three whitespace-separated fields are used to
    build a ``Region``; the line is then printed followed by ``r.gene``.

    Blank lines are skipped rather than raising ValueError.
    """
    bed = sys.argv[1]
    gtf = "Homo_sapiens.GRCh38.90.gtf"
    hg38 = Genome(reference_name='GRCh38',
                  annotation_name='my_genome_features',
                  gtf_path_or_url=gtf)
    hg38.index()
    with open(bed) as f:
        for line in f:
            record = line.strip()
            if not record:
                # An empty line has no fields, so the unpacking below would
                # fail with "not enough values to unpack".
                continue
            # Fields after the first three are ignored.
            chromosome, start, end, *_ = record.split()
            r = Region(chromosome, start, end)
            r.get_symbol(hg38)
            print(record, r.gene)
Ejemplo n.º 10
0
def test_gtf_only():
    """
    Exercise a Genome that was given a GTF file and no FASTA files.

    One gene is expected. Transcript and protein sequence lookups must raise
    ValueError, which is checked by ``no_transcript_`` / ``no_protein_``
    (helpers defined elsewhere).
    """
    gtf_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    gtf_genome.index()

    gene_count = len(gtf_genome.genes())
    eq_(1, gene_count)

    with assert_raises(ValueError) as transcript_error:
        gtf_genome.transcript_sequence("test")
    no_transcript_(transcript_error)

    with assert_raises(ValueError) as protein_error:
        gtf_genome.protein_sequence("test")
    no_protein_(protein_error)
def test_gtf_only():
    """
    Check a GTF-only Genome: one gene is expected, and both sequence lookups
    must raise ValueError.
    """
    annotation_only = Genome(reference_name="GRCm38",
                             annotation_name="_test_mouse_ensembl81_subset",
                             gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH)
    annotation_only.index()

    eq_(1, len(annotation_only.genes()))

    # Pair each sequence lookup with the helper that checks its error.
    lookups = (
        ("transcript_sequence", no_transcript_),
        ("protein_sequence", no_protein_),
    )
    for lookup_name, check_error in lookups:
        with assert_raises(ValueError) as cm:
            getattr(annotation_only, lookup_name)("test")
        check_error(cm)
def test_gtf_transcript_only():
    """
    Exercise a Genome given a GTF and a transcript FASTA but no protein FASTA.

    One gene is expected and the first transcript must have a truthy
    sequence. Reading its ``protein_sequence`` must raise ValueError, which
    is checked by ``no_protein_`` (helper defined elsewhere).
    """
    partial_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    partial_genome.index()

    gene_count = len(partial_genome.genes())
    eq_(1, gene_count)

    first_transcript = partial_genome.transcripts()[0]
    ok_(first_transcript.sequence)

    # Attribute access alone is expected to raise.
    with assert_raises(ValueError) as protein_error:
        first_transcript.protein_sequence
    no_protein_(protein_error)
Ejemplo n.º 13
0
def test_gtf_transcript_only():
    """
    Check a Genome built from a GTF plus transcript FASTA: one gene is
    expected, transcript sequences are available, and protein sequences
    raise ValueError.
    """
    genome_sources = {
        "gtf_path_or_url": MOUSE_ENSMUSG00000017167_PATH,
        "transcript_fasta_path_or_url":
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    }
    gtf_and_transcripts = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        **genome_sources)
    gtf_and_transcripts.index()

    eq_(1, len(gtf_and_transcripts.genes()))

    all_transcripts = gtf_and_transcripts.transcripts()
    tx = all_transcripts[0]
    ok_(tx.sequence)

    # No protein FASTA was supplied, so reading this attribute must fail.
    with assert_raises(ValueError) as cm:
        tx.protein_sequence
    no_protein_(cm)
def test_protein_fasta_only():
    """
    Exercise a Genome that was given a protein FASTA file and nothing else.

    Four protein sequences are expected to load. ``genes()`` and
    ``transcript_sequence()`` must raise ValueError, which is checked by
    ``no_gtf_`` / ``no_transcript_`` (helpers defined elsewhere).
    """
    protein_fastas = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    proteins_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fastas,
    )
    proteins_genome.index()

    n_proteins = len(proteins_genome.protein_sequences.fasta_dictionary)
    eq_(4, n_proteins)

    with assert_raises(ValueError) as gtf_error:
        proteins_genome.genes()
    no_gtf_(gtf_error)

    with assert_raises(ValueError) as transcript_error:
        proteins_genome.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(transcript_error)
def test_protein_fasta_only():
    """
    Check a protein-FASTA-only Genome: four protein sequences are expected,
    and the gene and transcript-sequence queries must raise ValueError.
    """
    only_proteins = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH])
    only_proteins.index()

    eq_(4, len(only_proteins.protein_sequences.fasta_dictionary))

    # Each failing query is paired with the helper that checks its error.
    failing_queries = (
        (lambda: only_proteins.genes(), no_gtf_),
        (lambda: only_proteins.transcript_sequence("DOES_NOT_EXIST"),
         no_transcript_),
    )
    for run_query, check_error in failing_queries:
        with assert_raises(ValueError) as cm:
            run_query()
        check_error(cm)
Ejemplo n.º 16
0
def test_ucsc_gencode_genome():
    """
    Verify gene and transcript loading from a small GENCODE GTF file
    downloaded from http://genome.ucsc.edu/cgi-bin/hgTables, including
    lookup by ID and by "chr1" locus.
    """
    with TemporaryDirectory() as scratch_dir:
        ucsc_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=scratch_dir,
        )
        ucsc_genome.index()

        # Seven genes expected, none with an empty ID.
        gene_list = ucsc_genome.genes()
        assert all(entry.id for entry in gene_list), \
            "Gene with missing ID in %s" % (ucsc_genome.gtf.dataframe(),)
        assert len(gene_list) == 7, \
            "Expected 7 genes, got %d: %s" % (len(gene_list), gene_list)

        # Seven transcripts expected, none with an empty ID.
        transcript_list = ucsc_genome.transcripts()
        assert all(entry.id for entry in transcript_list), \
            "Transcript with missing ID in %s" % (
                ucsc_genome.gtf.dataframe(),)
        assert len(transcript_list) == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                len(transcript_list), transcript_list)

        # This gene is expected to have an ID but no name or biotype.
        gene = ucsc_genome.gene_by_id("uc001aak.4")
        eq_(gene.id, "uc001aak.4")
        eq_(gene.name, None)
        eq_(gene.biotype, None)

        # Locus queries, using the "chr"-prefixed contig name.
        genes_here = ucsc_genome.genes_at_locus("chr1", 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = ucsc_genome.transcripts_at_locus("chr1", 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")
Ejemplo n.º 17
0
def load_genome(file):
    """
    Load a GTF annotation into the module-level ``data`` Genome and index it.

    Args:
        file: path or URL of the GTF file, passed on as ``gtf_path_or_url``.
    """
    global data
    genome_options = {
        'reference_name': 'GRCh38',
        'annotation_name': 'ENSEMBL',
        'gtf_path_or_url': file,
    }
    data = Genome(**genome_options)
    data.index()