def test_knownGene():
    """Check 'tx', 'cds' and 'exons' parsing of the hg19 knownGene table.

    The table is downloaded once to a temporary file and then re-read from
    that local copy for every mode, so the network is only hit one time.
    The temporary file is removed when the test finishes.
    """
    # Mirror of
    # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownGene.txt.gz
    # Slightly faster and more stable, I believe.
    knownGene_url = 'http://kt.era.ee/distribute/pyintervaltree/knownGene.txt.gz'

    # To speed up testing, we'll download the file and reuse the downloaded copy
    knownGene_file, _headers = urlretrieve(knownGene_url)
    try:
        knownGene_localurl = 'file:///%s' % os.path.abspath(knownGene_file)

        # Py3 downloads .gz files to local files with names not ending with
        # .gz, so decompression is requested explicitly.
        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, decompress=True)
        assert len(knownGene) == 82960
        result = knownGene[b'chr1'].search(100000, 138529)
        assert len(result) == 1
        assert list(result)[0].data['name'] == b'uc021oeg.2'

        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='cds', decompress=True)
        assert len(knownGene) == 82960
        assert not knownGene[b'chr1'].overlaps(100000, 138529)

        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='exons', decompress=True)
        assert len(knownGene) == 742493
        result = list(knownGene[b'chr1'].search(134772, 140566))
        assert len(result) == 3
        assert result[0].data == result[1].data and result[0].data == result[2].data
    finally:
        # Best-effort cleanup of the temporary download; a failure to delete
        # it must not mask the actual test outcome.
        try:
            os.remove(knownGene_file)
        except OSError:
            pass
Ejemplo n.º 2
0
def test_knownGene(base_url):
    """Check 'tx', 'cds' and 'exons' parsing of knownGene under ``base_url``.

    The table is fetched once with ``urlretrieve`` and then re-read from the
    local copy for each mode, which keeps the test reasonably fast.
    """
    remote_url = base_url + 'knownGene.txt.gz'
    local_path, _headers = urlretrieve(remote_url)
    local_url = 'file:///%s' % os.path.abspath(local_path)

    # Default mode. Py3 downloads .gz files to local files with names not
    # ending with .gz, so decompression is requested explicitly.
    tx_tree = GenomeIntervalTree.from_table(url=local_url, decompress=True)
    assert len(tx_tree) == 82960
    tx_hits = tx_tree[b'chr1'].search(100000, 138529)
    assert len(tx_hits) == 1
    assert list(tx_hits)[0].data['name'] == b'uc021oeg.2'

    # Coding-sequence intervals only.
    cds_tree = GenomeIntervalTree.from_table(
        url=local_url, mode='cds', decompress=True)
    assert len(cds_tree) == 82960
    assert not cds_tree[b'chr1'].overlaps(100000, 138529)

    # One interval per exon.
    exon_tree = GenomeIntervalTree.from_table(
        url=local_url, mode='exons', decompress=True)
    assert len(exon_tree) == 742493
    exon_hits = list(exon_tree[b'chr1'].search(134772, 140566))
    assert len(exon_hits) == 3
    # All three exon intervals must carry the same data record.
    assert (exon_hits[0].data == exon_hits[1].data
            and exon_hits[0].data == exon_hits[2].data)
Ejemplo n.º 3
0
def _test_promotorsearch():
    """Realistic example: find a promotor of a given gene ('NANOG' here).

    It is slow (it downloads the refGene table and a genome segmentation
    table), so you don't want to run it too much.
    """
    from intervaltree_bio import GenomeIntervalTree, UCSCTable

    # Download refGene table
    refGene = GenomeIntervalTree.from_table(
        url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz',
        parser=UCSCTable.REF_GENE)

    # Find the NANOG gene; the first matching interval is used.
    # NOTE(review): raises IndexError if no interval matches. Under Python 3
    # the parsed fields may be bytes, in which case this comparison would
    # need b'NANOG' -- confirm against the parser.
    nanog = [
        i for chrom in refGene for i in refGene[chrom]
        if i.data['name2'] == 'NANOG'
    ]
    nanog = nanog[0]

    # Download genome segmentation table
    e = Encode()
    segments = e.AwgSegmentation.CombinedHepg2.fetch().read_as_intervaltree()

    # Find the segmentation of the NANOG transcript +- 10kb
    results = segments[nanog.data['chrom']].search(nanog.begin - 10000,
                                                   nanog.end + 10000)

    # Leave the promotor/promotor flanking segments only
    results = [i for i in results if i.data[0] in ['PF', 'P']]
    # Function-call form works on both Python 2 and Python 3; the original
    # ``print results`` statement is a SyntaxError on Python 3.
    print(results)
Ejemplo n.º 4
0
def create_gene_tree(bed_file_path):
    """Collect the IDs of non-overlapping records from a tab-separated file.

    Only rows with exactly nine columns whose first column does not start
    with '##' are considered. A row is kept when its [start, end) range does
    not overlap any previously kept range on the same sequence; kept ranges
    are tracked in a local ``GenomeIntervalTree``.

    Args:
        bed_file_path: path of the tab-separated annotation file to read.

    Returns:
        A dict mapping the column-5 and column-6 IDs of every kept row to 1.
        The interval tree itself is not returned.

    Raises:
        ValueError: if a numeric column of a considered row cannot be parsed.
    """
    kept_ids = {}
    genome_tree = GenomeIntervalTree()

    with open(bed_file_path, 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            # Skip malformed rows and '##' header lines.
            if len(fields) != 9 or fields[0].startswith('##'):
                continue

            seqid = fields[0]
            begin = int(fields[1])
            stop = int(fields[2])
            mrna_id = fields[4]
            gene_id = fields[5]
            # These columns are not used below, but they are still parsed so
            # that a malformed value raises, as it always has.
            _coverage = float(fields[6])
            _identity = float(fields[7])
            _matches = int(fields[8])

            chrom_tree = genome_tree[seqid]
            if chrom_tree.overlaps(begin, stop):
                continue

            kept_ids[mrna_id] = 1
            kept_ids[gene_id] = 1
            chrom_tree.addi(begin, stop,
                            data={"ID": mrna_id, "Parent": gene_id})

    return kept_ids
Ejemplo n.º 5
0
def test_ensGene(base_url):
    """Smoke test: the ensGene table under ``base_url`` parses in 'cds' mode."""
    table_url = base_url + 'ensGene.txt.gz'
    tree = GenomeIntervalTree.from_table(
        url=table_url, mode='cds', parser=UCSCTable.ENS_GENE)
    assert len(tree) == 204940
Ejemplo n.º 6
0
def test_genepred():
    """Smoke-test for output from gtfToGenePred.

    Reads ``test_data/test_genepred.txt`` (located next to this file) in
    'cds' mode and checks the number of intervals.
    """
    testdir = os.path.join(os.path.dirname(__file__), 'test_data')
    genepred_path = os.path.abspath(os.path.join(testdir, "test_genepred.txt"))
    # Use a context manager so the file handle is closed once parsing is done
    # (the original left it open).
    with open(genepred_path) as kg:
        gtree = GenomeIntervalTree.from_table(fileobj=kg,
                                              mode='cds',
                                              parser=UCSCTable.GENEPRED)
    assert len(gtree) == 100
Ejemplo n.º 7
0
def test_refGene(base_url):
    """Smoke test: the refGene table under ``base_url`` parses in 'tx' mode."""
    table_url = base_url + 'refGene.txt.gz'
    tree = GenomeIntervalTree.from_table(
        url=table_url, mode='tx', parser=UCSCTable.REF_GENE)
    # NB: Some time ago it was 50919, hence it seems the table data changes
    # on UCSC and eventually the mirror and UCSC won't be the same.
    expected_count = 52350
    assert len(tree) == expected_count
Ejemplo n.º 8
0
def test_pickling():
    """A GenomeIntervalTree keeps its sizes across a pickle round trip."""
    original = GenomeIntervalTree()
    original['a'][1:2] = ['some', 'data']
    original['a'][1.5:2.5] = ['more', 'data']
    original['b'][10:12] = ['even', 'more', 'data']

    restored = pickle.loads(pickle.dumps(original))

    # Both the overall size and the per-key tree size must be preserved.
    assert len(original) == len(restored)
    assert len(original['a']) == len(restored['a'])
Ejemplo n.º 9
0
def gtf2tree(gtf_path):
    """Build a GenomeIntervalTree of transcripts from a GTF file.

    Two intermediate files are derived from ``gtf_path`` by replacing its
    last extension: ``<base>.genePred`` and ``<base>.UCSCTable.gz``. They are
    passed to ``gtf_to_genepred`` and ``genepred_to_UCSCtable`` (defined
    elsewhere; presumably each writes its second argument from its first --
    confirm), and the gzipped table is then parsed in 'tx' mode.

    Args:
        gtf_path: path to the input GTF file.

    Returns:
        The GenomeIntervalTree produced by ``GenomeIntervalTree.from_table``.
    """
    base_path = os.path.splitext(gtf_path)[0]
    genepred_annot = base_path + ".genePred"
    ucsc_annot = base_path + ".UCSCTable.gz"
    gtf_to_genepred(gtf_path, genepred_annot)
    genepred_to_UCSCtable(genepred_annot, ucsc_annot)
    # Close the gzip handle once the table has been read (the original
    # leaked it).
    with gzip.open(ucsc_annot) as kg:
        gtree = GenomeIntervalTree.from_table(fileobj=kg,
                                              mode='tx',
                                              parser=UCSCTable.ENS_GENE)
    return gtree
Ejemplo n.º 10
0
 def read_as_intervaltree(self):
     '''
     Load this file into an ``intervaltree_bio.GenomeIntervalTree``.

     ``self['type']`` must be one of 'bed', 'narrowPeak' or 'broadPeak';
     anything else fails the assertion below. The file is opened through
     ``self.open_text()`` and handed to ``GenomeIntervalTree.from_bed``.

     As noted by the original author: like ``open`` and ``open_text`` this
     won't download the file to cache if it is not there, the whole file is
     read into memory, and the ``data`` field of each interval holds the
     tab-separated fields of the corresponding line after the first three.

     Returns:
         a GenomeIntervalTree instance.
     '''
     supported_types = ['bed', 'narrowPeak', 'broadPeak']
     assert self['type'] in supported_types

     with self.open_text() as bed_handle:
         return GenomeIntervalTree.from_bed(fileobj=bed_handle)
Ejemplo n.º 11
0
    def read_as_intervaltree(self):
        '''
        Read this file into an ``intervaltree_bio.GenomeIntervalTree``.

        The entry's ``self['type']`` is asserted to be 'bed', 'narrowPeak'
        or 'broadPeak'. The text stream returned by ``self.open_text()`` is
        then parsed with ``GenomeIntervalTree.from_bed``.

        Notes carried over from the original documentation: similarly to
        ``open`` and ``open_text`` the file is not downloaded to cache if it
        is not there; the whole file is read to memory; each interval's
        ``data`` field contains the tab-separated columns of its line from
        the fourth column onwards.

        Returns:
            a GenomeIntervalTree instance.
        '''
        file_type = self['type']
        assert file_type in ['bed', 'narrowPeak', 'broadPeak']

        with self.open_text() as text_stream:
            genome_tree = GenomeIntervalTree.from_bed(fileobj=text_stream)
        return genome_tree
Ejemplo n.º 12
0
def _test_promotorsearch():
    """Realistic example: find a promotor of a given gene ('NANOG' here).

    It is slow (it downloads the refGene table and a genome segmentation
    table), so you don't want to run it too much.
    """
    from intervaltree_bio import GenomeIntervalTree, UCSCTable

    # Download refGene table
    refGene = GenomeIntervalTree.from_table(url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz', parser=UCSCTable.REF_GENE)

    # Find the NANOG gene; the first matching interval is used, so an
    # IndexError here means no interval matched.
    nanog = [i for chrom in refGene for i in refGene[chrom] if i.data['name2'] == 'NANOG']
    nanog = nanog[0]

    # Download genome segmentation table
    e = Encode()
    segments = e.AwgSegmentation.CombinedHepg2.fetch().read_as_intervaltree()

    # Find the segmentation of the NANOG transcript +- 10kb
    results = segments[nanog.data['chrom']].search(nanog.begin - 10000, nanog.end + 10000)

    # Leave the promotor/promotor flanking segments only
    results = [i for i in results if i.data[0] in ['PF', 'P']]
    # print() as a function call is valid on Python 2 and 3 alike; the
    # original print statement does not compile on Python 3.
    print(results)
Ejemplo n.º 13
0
class AnnoTestCase(unittest.TestCase):
    """Tests annotation"""
    # Fixture built once, when the class body is executed: the test GTF is
    # converted to genePred and then to a gzipped UCSC table, which is loaded
    # into the module-level ``gtree`` that every test below reads.
    mygtf = os.path.realpath("test_data/test.gtf.gz")
    genepred_annot = os.path.splitext(mygtf)[0] + ".genePred"
    ucsc_annot = os.path.splitext(mygtf)[0] + ".UCSCTable.gz"
    # The original ran this pair of conversions twice in a row with the same
    # arguments; a single pass is kept.
    su.gtf_convert.gtf_to_genepred(mygtf, genepred_annot)
    su.gtf_convert.genepred_to_UCSCtable(genepred_annot, ucsc_annot)
    global gtree
    # Close the gzip handle as soon as the table has been parsed.
    with gzip.open(ucsc_annot) as kg:
        gtree = GenomeIntervalTree.from_table(fileobj=kg, mode='tx', parser=UCSCTable.ENS_GENE)

    @staticmethod
    def _make_jxn(name, dist):
        """Return the one-row junction DataFrame used as input by the tests."""
        return pd.DataFrame({'name': name,
                             'jxn_reads': 'a1,a2,a3',
                             'jxn_counts': 3,
                             'spans': 5,
                             'spanreads': 's1,s2,s3,s4,s5',
                             'dist': dist,
                             'ann_format': 'Symbol:Transcript:Strand:Exon_No:Dist_to_Exon:Frame:CDS_Length'},
                            index=pd.Series(0))

    @staticmethod
    def _annotate_side(jxn_filt, prefix, side):
        """Add ``<prefix>_symbol/_annot/_strand/_cdslen`` columns for one side.

        ``side`` is forwarded unchanged as the third argument of
        ``su.annotate_sv.get_jxnside_anno`` (the tests use 1 for 'left' and
        2 for 'right').
        """
        annos = jxn_filt.apply(lambda x: su.annotate_sv.get_jxnside_anno(x['name'], gtree, side), axis=1)
        (jxn_filt[prefix + '_symbol'],
         jxn_filt[prefix + '_annot'],
         jxn_filt[prefix + '_strand'],
         jxn_filt[prefix + '_cdslen']) = zip(*annos)

    def test_get_jxnside_anno_v1(self):
        """test get_jxnside_anno"""
        jxn_filt = self._make_jxn('chr1:871160:+:chr1:985950:-:0:0', 114790)
        self._annotate_side(jxn_filt, 'left', 1)
        self._annotate_side(jxn_filt, 'right', 2)
        assert jxn_filt['left_symbol'].iloc[0] == 'SAMD11'
        assert jxn_filt['right_symbol'].iloc[0] == 'AGRN'
        assert jxn_filt['left_strand'].iloc[0] == '+'
        assert jxn_filt['right_strand'].iloc[0] == '+'
        assert jxn_filt['left_cdslen'].iloc[0] == 9852
        assert jxn_filt['right_cdslen'].iloc[0] == 3395

    def test_get_jxnside_anno_v2(self):
        """test get_jxnside_anno.. Tests where no transcripts found"""
        jxn_filt = self._make_jxn('chr19:560462:+:chr8:560462:-:0:0', 13082084)
        self._annotate_side(jxn_filt, 'left', 1)
        assert jxn_filt['left_symbol'].iloc[0] == 'NA'
        assert jxn_filt['left_strand'].iloc[0] == 'NA'
        assert jxn_filt['left_cdslen'].iloc[0] == 'NA'

    def test_get_jxnside_anno_v3(self):
        """test get_jxnside_anno.. Tests non-coding transcripts for sorting"""
        jxn_filt = self._make_jxn('chr12:15704606:+:chr1:11918527:-:2:1', 'NA')
        self._annotate_side(jxn_filt, 'left', 1)
        assert jxn_filt['left_symbol'].iloc[0] == 'NA'
        assert jxn_filt['left_strand'].iloc[0] == 'NA'
        assert jxn_filt['left_cdslen'].iloc[0] == 'NA'

    def test_get_jxn_genes(self):
        """test_get_jxn_genes"""
        jxn_filt = self._make_jxn('chr1:871160:+:chr1:985950:-:0:0', 114790)
        jxn_filt['left_all'], jxn_filt['right_all'] = zip(*jxn_filt.apply(lambda x: su.annotate_sv.get_jxn_genes(x['name'], gtree), axis=1))
        assert jxn_filt['left_all'].iloc[0] == ['SAMD11']
        # Positional access, consistent with the other assertions; the frame
        # has a single row labelled 0, so this selects the same element as
        # the original ``.loc[0]``.
        assert jxn_filt['right_all'].iloc[0] == ['AGRN']

    def test_get_pos_genes(self):
        """test_get_pos_genes"""
        assert su.annotate_sv.get_pos_genes('chr1', 871160, gtree) == ['SAMD11']
def test_refGene():
    """Smoke-test for refGene: parse the table in 'tx' mode and count intervals."""
    # Mirror of
    # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz
    # (the original assigned the UCSC URL first and immediately overwrote it
    # with the mirror, so only the mirror was ever used).
    refGene_url = 'http://kt.era.ee/distribute/pyintervaltree/refGene.txt.gz'
    refGene = GenomeIntervalTree.from_table(url=refGene_url, mode='tx', parser=UCSCTable.REF_GENE)
    # NB: Some time ago it was 50919, hence it seems the table data changes
    # on UCSC and eventually the mirror and UCSC won't be the same.
    assert len(refGene) == 52350
def test_ensGene():
    """Smoke-test we can at least read ensGene (parsed in 'cds' mode)."""
    # Mirror of
    # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/ensGene.txt.gz
    # (the original assigned the UCSC URL first and immediately overwrote it
    # with the mirror, so only the mirror was ever used).
    ensGene_url = 'http://kt.era.ee/distribute/pyintervaltree/ensGene.txt.gz'
    ensGene = GenomeIntervalTree.from_table(url=ensGene_url, mode='cds', parser=UCSCTable.ENS_GENE)
    assert len(ensGene) == 204940
Ejemplo n.º 16
0
def bed_to_tree(bed):
    """Parse the BED file at path ``bed`` into a GenomeIntervalTree.

    The file is opened in text mode and closed again before returning.
    """
    with open(bed, 'r') as bed_handle:
        return GenomeIntervalTree.from_bed(fileobj=bed_handle)