def make_db():
    """
    Create the gffutils database unless the on-disk copy is current.
    """
    size, md5, fn = DB
    # Nothing to do when the existing db already matches the checksum.
    if _up_to_date(md5, fn):
        return
    gffutils.create_db(fn.replace('.db', ''), fn, verbose=True, force=True)
Beispiel #2
0
def add_removed_evm(pasa, exon, wd):
    """
    here the clusters of sequence from the same locus are prepared
    """
    evm_db = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
    evm_mrna_ids = [mrna.attributes["ID"][0] for mrna in evm_db.features_of_type("mRNA")]

    gmap_db = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
    gmap_gene_ids = [gene.attributes["ID"][0] for gene in gmap_db.features_of_type("gene")]
    gmap_mrna_roots = [mrna.attributes["ID"][0].split("_")[0] for mrna in gmap_db.features_of_type("mRNA")]

    # EVM mRNAs with no GMAP counterpart, mapped back to their parent genes.
    evm_only = [mrna_id for mrna_id in evm_mrna_ids if mrna_id not in gmap_mrna_roots]
    parent_genes = {parent.attributes["ID"][0]
                    for mrna_id in evm_only
                    for parent in evm_db.parents(mrna_id)}

    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    writer = gffwriter.GFFWriter(outfile.name)

    # Unique EVM genes: children first, then the gene record itself.
    for gene_id in parent_genes:
        for child in evm_db.children(gene_id, order_by='start'):
            writer.write_rec(child)
        writer.write_rec(evm_db[gene_id])
    # Then every GMAP gene with its children.
    for gene_id in gmap_gene_ids:
        for child in gmap_db.children(gene_id, order_by='start'):
            writer.write_rec(child)
        writer.write_rec(gmap_db[gene_id])
    writer.close()

    return outfile.name
Beispiel #3
0
def make_db(i):
    """
    Module-level function that can be pickled across processes for
    multiprocessing testing.
    """
    suffix = '.%s' % i
    gffutils.create_db(fn, ':memory:', _keep_tempfiles=suffix)
    return i
Beispiel #4
0
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.

    Serves the example-data folder over HTTP, builds a DB from the URL,
    then rebuilds a second DB from an iterator over the first and checks
    that gene features survived the round trip.
    """
    print("Testing creation of DB from URL iterator")
    # initially run SimpleHTTPServer at port 0 and os will take first available
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)

    # Serving test/data folder
    served_folder = gffutils.example_filename('')
    os.chdir(served_folder)

    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    # BUGFIX: the attribute is 'daemon'; the original's 'deamon' typo just
    # created an unused attribute, leaving the thread non-daemonic.
    server_thread.daemon = True
    server_thread.start()
    try:
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)

        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."
    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
Beispiel #5
0
def test_delete():
    """Exercise FeatureDB.delete with IDs, Feature objects, and a whole DB."""
    db_fname = gffutils.example_filename("gff_example1.gff3")

    # Remove features one at a time by string ID, checking the survivors
    # after every deletion.
    db = gffutils.create_db(db_fname, ':memory:')
    ids = [feat.id for feat in db.all_features()]
    remaining = set(ids)
    for feature_id in ids:
        db.delete(feature_id)
        expected = remaining - {feature_id}
        remaining = {feat.id for feat in db.all_features()}
        assert remaining == expected, (remaining, expected)
    assert len(remaining) == 0

    # Same incremental deletion, driven by Feature objects instead of IDs.
    db = gffutils.create_db(db_fname, ':memory:')
    features = list(db.all_features())
    remaining = set(features)
    for feature in features:
        db.delete(feature)
        expected = remaining - {feature}
        remaining = set(db.all_features())
        assert remaining == expected, (remaining, expected)
    assert len(remaining) == 0, remaining

    # Deleting with a whole FeatureDB empties the target database.
    db1 = gffutils.create_db(db_fname, ':memory:')
    db2 = gffutils.create_db(db_fname, ':memory:')
    db1.delete(db2)
    assert len(list(db1.all_features())) == 0

    # Deleting an unknown ID is a no-op rather than an error.
    db = gffutils.create_db(db_fname, ':memory:')
    db.delete('nonexistent')
Beispiel #6
0
def add_EVM(final_update, wd, consensus_mapped_gff3):
    """
    Write a GFF3 combining EVM mRNAs that have no GMAP counterpart
    (with their parent genes and children) plus every GMAP gene.

    :param final_update: EVM GFF3 file
    :param wd: working directory for the temporary output file
    :param consensus_mapped_gff3: GMAP consensus GFF3 file
    :return: path of the temporary GFF3 file written
    """
    db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    # Set: O(1) membership tests instead of scanning a list per EVM mRNA.
    ids_gmap = {gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")}

    # Fixed `not evm in ...` to the idiomatic `not in`.
    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]

    # Collect the (deduplicated) parents of the EVM-only mRNAs.
    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))
    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name
Beispiel #7
0
 def create_from(self, filename, forceCreate=False):
     """Build the gffutils sqlite db at self.dbname from the given .gff file."""
     gffutils.create_db(filename, dbfn=self.dbname, force=forceCreate,
                        keep_order=False, merge_strategy='merge')
def parsecontextscores(csfile, gff, featurename):
    """
    Build {UTRname : [[UTRlength], [names of all miRNAs that have sites in that UTR]]}.

    :param csfile: output of targetscan_60_context_scores.pl (tab separated)
    :param gff: gff file of regions of interest
    :param featurename: feature category in gff file (3rd field)
    :return: CSdict as described above (mouse '10090' entries only)
    """
    lengthdict = {}
    CSdict = {}

    # First, get feature lengths from a throwaway gffutils database.
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'

    if not os.path.isfile(db_fn):  # if database doesn't exist, create it
        gffutils.create_db(gff_fn, db_fn)

    db = gffutils.FeatureDB(db_fn)
    for feature in db.features_of_type(featurename):
        lengthdict[feature.id] = feature.stop - feature.start

    os.remove(db_fn)

    # Now get miRNA names; `with` guarantees the handle is closed even if a
    # malformed line raises.
    with open(csfile, 'r') as csfilehandle:
        for line in csfilehandle:
            line = line.strip().split('\t')
            if line[0] != 'Gene ID':  # skip header line
                featureid = line[0].split(';')[0]  # Remove Parent=...
                species = line[1]
                miRNAname = line[2]
                if species == '10090':  # this is mouse; for other species, change this number
                    if featureid not in CSdict:
                        CSdict[featureid] = [[lengthdict[featureid]], [miRNAname]]
                    else:
                        CSdict[featureid][1].append(miRNAname)

    return CSdict
Beispiel #9
0
def test_disable_infer():
    """
    tests the new semantics for disabling gene/transcript inference
    """
    # To start, we construct a GTF db by inferring genes and transcripts
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                            ':memory:')

    # Then create a file missing transcripts, and another missing genes.
    import tempfile
    tempfile.tempdir = None
    # FIX: close the NamedTemporaryFile handles (the originals were leaked)
    # and use `with` so the write handles are closed even on error.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        no_transcripts_fn = tmp.name
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        no_genes_fn = tmp.name
    with open(no_transcripts_fn, 'w') as no_transcripts, \
            open(no_genes_fn, 'w') as no_genes:
        for feature in db.all_features():
            if feature.featuretype != 'transcript':
                no_transcripts.write(str(feature) + '\n')
            if feature.featuretype != 'gene':
                no_genes.write(str(feature) + '\n')

    no_tx_db = gffutils.create_db(no_transcripts_fn,
                                  ':memory:',
                                  disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes_fn,
                                  ':memory:',
                                  disable_infer_genes=True)
    no_xx_db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                                  ':memory:',
                                  disable_infer_genes=True,
                                  disable_infer_transcripts=True)

    # no transcripts but 3 genes
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3

    # no genes but 4 transcripts
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4

    # no genes or transcripts
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
def _get_gtf_db(gtf):
    """Return a gffutils FeatureDB for `gtf`, building `<gtf>.db` on first use."""
    db_file = gtf + ".db"
    if not file_exists(db_file):
        print("Creating gffutils database for %s." % (gtf))
        no_tx, no_genes = guess_disable_infer_extent(gtf)
        # Warn when any inference will run -- it can take a long time.
        if not (no_tx and no_genes):
            print("'transcript' or 'gene' entries not found, so inferring "
                  "their extent. This can be very slow.")
        gffutils.create_db(gtf,
                           dbfn=db_file,
                           disable_infer_genes=no_genes,
                           disable_infer_transcripts=no_tx,
                           id_spec=guess_id_spec(gtf),
                           merge_strategy="create_unique",
                           keep_order=True,
                           verbose=True)
    return gffutils.FeatureDB(db_file)
Beispiel #11
0
def create_database(gff_file, database_name):
    """Build an on-disk gffutils database named `<database_name>.db` from `gff_file`."""
    gffutils.create_db(gff_file,
                       dbfn=f"{database_name}.db",
                       force=True,  # overwrite an existing database
                       merge_strategy="merge",
                       keep_order=True,
                       sort_attribute_values=True,
                       verbose=True)
    return
Beispiel #12
0
def getexoniccoords(posfactors, gff):
    """Return {geneid : {txid : [positionfactor, set(exonic coords)]}}.

    For every transcript of every gene in posfactors, collect the full set
    of exonic genomic coordinates (1-based, inclusive of exon ends).

    Classification intent (performed by the caller):
    - if the txend of a transcript is exonic in every transcript with a
      higher PF, the gene is all TUTR;
    - if no txend is exonic in any other transcript, the gene is all ALE;
    - otherwise the gene is somehow mixed.

    posfactors = {ENSMUSG : {ENSMUST : positionfactor}}
    NOTE(review): Python 2 code (print statements).
    """
    #Make gff database; reuse an existing <gff>.db index when present.
    print 'Indexing gff...'
    gff_fn = gff
    db_fn = os.path.abspath(gff_fn) + '.db'
    if os.path.isfile(db_fn) == False:
        gffutils.create_db(gff_fn, db_fn, merge_strategy='merge', verbose=True)

    db = gffutils.FeatureDB(db_fn)
    print 'Done indexing!'

    exoniccoords = {
    }  #{geneid : {txid : [positionfactor, [set of exonic coords]]}}
    for gene in posfactors:
        txs = posfactors[gene].keys()
        geneexons = {}  #{txid : [positionfactor, [set of exonic coords]]}
        for tx in txs:
            txexons = []
            pf = posfactors[gene][tx]
            # Transcript IDs in the db carry a 'transcript:' prefix.
            tx = db['transcript:' + tx]
            if tx.strand == '+':
                for exon in db.children(tx,
                                        featuretype='exon',
                                        order_by='start'):
                    txexons += range(exon.start, exon.end + 1)
            elif tx.strand == '-':
                for exon in db.children(tx,
                                        featuretype='exon',
                                        order_by='start',
                                        reverse=True):
                    txexons += range(exon.start, exon.end + 1)

            geneexons[tx.id] = [pf, set(txexons)]

        exoniccoords[gene] = geneexons

    return exoniccoords
 def parse_gff3(self):
     """Parse the Ensembl gff3 file into an in-memory gffutils database,
     then collect genes and transcripts from it."""
     print("-------- Ensembl data Parsing --------")
     print("\tParsing gff3 file...")
     print("\tcreating temporary database from file: " + self.gff)
     gff_path = gffutils.example_filename(self.gff)
     gff_db = gffutils.create_db(gff_path, ":memory:", merge_strategy="create_unique")
     self.collect_genes(gff_db)
     self.collect_Transcripts(gff_db)
Beispiel #14
0
def test_pr_139():
    """Interfeatures must not inherit the first exon's attributes (PR #139)."""
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:')
    exons = list(db.features_of_type('exon'))
    interfeatures = list(db.interfeatures(exons))

    # Previously, the first exon's Name leaked into subsequent merged features.
    first_name = exons[0].attributes['Name'][0]
    for idx in (1, 2, 3):
        assert first_name not in interfeatures[idx].attributes['Name']
Beispiel #15
0
def restore_gff_db(gtf_fn):
    """
    Open (or build) a gffutils database for `gtf_fn` at `<gtf_fn>.gffdb`.

    Scans up to the first 100 feature lines of the GTF to decide whether
    'gene'/'transcript' records are already present, and disables the
    corresponding (slow) inference steps when they are.

    :param gtf_fn: path to the GTF file, or None
    :return: a gffutils FeatureDB, or None when gtf_fn is None
    """
    gtf_db = None
    if gtf_fn is not None:
        gtf_db_fn = gtf_fn + '.gffdb'
        if not os.path.isfile(gtf_db_fn):
            try:
                # check if 'gene' or 'transcript' is in GTF
                disable_gene, disable_trans = False, False
                with open(gtf_fn) as fp:
                    l = 0
                    for line in fp:
                        line = line.rstrip()
                        if len(line) < 1: continue
                        if line[0] != '#':
                            if line.split()[2] == 'gene':
                                disable_gene = True
                            elif line.split()[2] == 'transcript':
                                disable_trans = True
                            l += 1

                        if (disable_gene and disable_trans) or l == 100: break
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} ...'.format(gtf_fn))
                gtf_db = gu.create_db(gtf_fn,
                                      gtf_db_fn,
                                      disable_infer_genes=disable_gene,
                                      disable_infer_transcripts=disable_trans)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} done!'.format(gtf_fn))

            # BUGFIX: narrowed bare `except:` so KeyboardInterrupt/SystemExit
            # propagate instead of being reported as a parse error.
            except Exception:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nCheck if annotation file format is correct'
                    .format(gtf_fn))
                sys.exit(IOError)
        else:
            try:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} ...'.format(gtf_fn))
                gtf_db = gu.FeatureDB(gtf_db_fn)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} done!'.format(gtf_fn))

            # BUGFIX: same narrowing as above.
            except Exception:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nTry to remove this db file and re-run'
                    .format(gtf_db_fn))
                sys.exit(IOError)
    return gtf_db
Beispiel #16
0
def _output_gff3(gff3_file, out_file, dialect):
    """Rewrite exon features from `gff3_file` into `out_file`, replacing
    their attributes with GTF-style transcript_id/gene_id pairs."""
    db = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for exon in DataIterator(db.features_of_type("exon"), dialect=dialect):
                tx_id = exon["Parent"][0]
                gene_id = db[tx_id]["Parent"][0]
                exon.attributes = gffutils.attributes.Attributes(
                    {"transcript_id": tx_id, "gene_id": gene_id})
                print(exon, file=out_handle, end="")
Beispiel #17
0
def create_db(gtf, dbfn):
    """
    From a 'gtf' file, create a 'dbfn' sqlite database.
    """
    logger.info('Creating db')

    # Explicit ID fields per feature type; inference is disabled since the
    # GTF is expected to already contain gene/transcript records.
    id_spec = {
        'exon': 'exon_id',
        'gene': 'gene_id',
        'transcript': 'transcript_id',
    }
    gffutils.create_db(gtf,
                       dbfn=dbfn,
                       force=True,  # Delete db if already exist
                       merge_strategy='merge',
                       id_spec=id_spec,
                       disable_infer_transcripts=True,
                       disable_infer_genes=True)
    logger.info('db done')
Beispiel #18
0
def gff_gene_check(GffName, db_name, memory=0):
    """
    Build a gffutils database containing only the 'gene' records of a GFF.

    The IDs on the GFFs must be unique, and because only gene information
    is needed, all other feature lines are dropped (comment lines kept).

    :param GffName: path to the input GFF file
    :param db_name: path for the on-disk database (ignored when memory is truthy)
    :param memory: truthy -> build the db in memory and return it
    :return: the in-memory db when memory is truthy, else None
    """
    kept = []
    for line in open(GffName):
        if line.startswith("#"):
            kept.append(line)
        else:
            fields = line.split()
            # Guard against blank/short lines, which previously raised
            # IndexError on fields[2].
            if len(fields) > 2 and fields[2] == "gene":
                kept.append(line)
    # Join once instead of quadratic string `+=` in the loop.
    tempgff = "".join(kept)

    if memory:
        # Write the db in memory and return it as variable so it can be used
        # as subclass of _DBCreator
        return gffutils.create_db(tempgff, ":memory:", from_string=True)
    gffutils.create_db(tempgff, db_name, from_string=True)
Beispiel #19
0
def open_gff_db(gff):
    """Open `gff` as a gffutils FeatureDB; when it is not itself a database,
    fall back to (or create) a sibling `<gff>.sqlite3` database."""
    try:
        return gffutils.FeatureDB(gff)
    except sqlite3.DatabaseError:
        sqlite_path = f"{gff}.sqlite3"
        if os.path.exists(sqlite_path):
            sys.stderr.write(
                f"File {gff}.sqlite3 exists. Using existing database.\n")
            return gffutils.FeatureDB(sqlite_path)
        return gffutils.create_db(gff, sqlite_path)
Beispiel #20
0
def get_gtf_db(gtf, in_memory=False):
    """
    create a gffutils DB
    """
    if in_memory:
        return gffutils.create_db(gtf, dbfn=":memory:")
    db_file = gtf + ".db"
    # Build the on-disk database only when it is not already present.
    if not file_exists(db_file):
        gffutils.create_db(gtf, dbfn=db_file)
    return gffutils.FeatureDB(db_file)
Beispiel #21
0
 def __init__(self, file, in_memory=False, db_path=None):
     """Build a gffutils db for `file`: in memory, at `db_path`, or in a
     throwaway temp file (self.remove_db marks it for later removal)."""
     self.remove_db = False
     if in_memory:
         dbfn = ":memory:"
     elif db_path is not None:
         dbfn = db_path
     else:
         # Reserve a unique temp filename; the file itself is removed when
         # the context exits, so only the name is reused below.
         with tempfile.NamedTemporaryFile() as t:
             dbfn = t.name
         self.remove_db = True
     self.dbfn = dbfn
     self.db = gffutils.create_db(file, self.dbfn, merge_strategy="error")
def split3UTR(UTR3gff, fragsize, outfile):
    gff_fn = UTR3gff
    print 'Indexing gff...'
    db_fn = os.path.abspath(gff_fn) + '.db'
    if os.path.isfile(db_fn) == False:
        gffutils.create_db(gff_fn, db_fn, merge_strategy='merge', verbose=True)

    db = gffutils.FeatureDB(db_fn)
    print 'Done indexing!'

    UTR3s = db.features_of_type('UTR3')

    outfh = open(outfile, 'w')

    for UTR3 in UTR3s:
        #Only going to consider single exon UTRs
        if len(list(db.children(UTR3, featuretype='exon', level=1))) > 1:
            continue

        ID = UTR3.attributes['ID'][0]
        parent = UTR3.attributes['Parent'][0]
        gene_id = UTR3.attributes['gene_id'][0]

        coord = UTR3.start
        counter = 1
        while coord <= UTR3.end:
            windowstart = coord
            windowend = coord + fragsize
            idfield = 'ID=' + ID + '.utr3fragment{0}'.format(
                counter) + ';Parent=' + parent + ';gene_id=' + gene_id
            with open(outfile, 'a') as outfh:
                outfh.write(('\t').join([
                    str(UTR3.chrom), 'longest3UTRfrags', 'UTR3frag',
                    str(windowstart),
                    str(windowend), '.',
                    str(UTR3.strand), '.', idfield
                ]) + '\n')
            coord = coord + fragsize + 1
            counter += 1

    os.remove(db_fn)
Beispiel #23
0
def test_iterator_update():
    """merge_strategy='replace' via a generator or transform must replace
    gene coordinates while leaving all other features untouched."""
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    assert len(list(db.all_features())) == 12
    orig_exon_coords = {(exon.start, exon.stop) for exon in db.features_of_type('exon')}

    # Reset every gene to span [1, 100] via a generator of modified features.
    def gen():
        for feature in db.features_of_type('gene'):
            feature.start = 1
            feature.stop = 100
            yield feature

    db.update(gen(), merge_strategy='replace')
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    gene = six.next(db.features_of_type('gene'))
    assert gene.start == 1, gene.start
    assert gene.stop == 100, gene.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == {(exon.start, exon.stop) for exon in db.features_of_type('exon')}

    # Same reset, expressed as a transform callable instead of a generator.
    def _transform(f):
        f.start = 1
        f.stop = 100
        return f

    db = gffutils.create_db(gffutils.example_filename("gff_example1.gff3"), ':memory:')
    db.update(db.features_of_type('gene'), merge_strategy='replace', transform=_transform)
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    gene = six.next(db.features_of_type('gene'))
    assert gene.start == 1, gene.start
    assert gene.stop == 100, gene.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == {(exon.start, exon.stop) for exon in db.features_of_type('exon')}
Beispiel #24
0
    def subset_db(self,
                  output_path,
                  genes=None,
                  region_bed=None,
                  span=500000,
                  disable_infer_genes=True,
                  disable_infer_transcripts=True):
        """
        Write a new gffutils database at `output_path` containing only the
        features within `span` bp of the given BED regions and/or genes.

        :param output_path: path for the subset database
        :param genes: iterable of gene names resolvable by self.get_gene_feature
        :param region_bed: BED path or pre-loaded DataFrame (chrom, start, end)
        :param span: flank (bp) added on each side of every region/gene
        :param disable_infer_genes: passed through to gffutils.create_db
        :param disable_infer_transcripts: passed through to gffutils.create_db
        """
        all_features = {}

        if region_bed is not None:
            if isinstance(region_bed, (str, pathlib.Path)):
                region_bed = pd.read_csv(region_bed,
                                         sep='\t',
                                         index_col=0,
                                         header=None)

            # BUGFIX: iterrows is a method; the original iterated the bound
            # method object itself, which raised TypeError at runtime.
            for _, (chrom, start, end) in region_bed.iterrows():
                start = max(start - span, 0)
                end = end + span  # it's OK to exceed the chromosome, as here we just select valid gtf features
                features = self.region((chrom, start, end))
                for f in features:
                    all_features[f.id] = f

        if genes is not None:
            for gene in genes:
                feature = self.get_gene_feature(gene)
                start = max(feature.start - span, 0)
                end = feature.end + span
                features = self.region((feature.chrom, start, end))
                for f in features:
                    all_features[f.id] = f

        if len(all_features) == 0:
            print('No features selected.')
            return
        else:
            create_db(list(all_features.values()),
                      dbfn=output_path,
                      disable_infer_genes=disable_infer_genes,
                      disable_infer_transcripts=disable_infer_transcripts)
        return
Beispiel #25
0
def get_gtf_db(gtf, in_memory=False):
    """
    create a gffutils DB from a GTF file and will use an existing gffutils
    database if it is named {gtf}.db
    """
    if in_memory:
        return gffutils.create_db(gtf, dbfn=":memory:", disable_infer_genes=True)
    db_file = gtf + ".db"
    # Build the on-disk database only when no cached copy exists yet.
    if not os.path.exists(db_file):
        gffutils.create_db(gtf, dbfn=db_file, disable_infer_genes=True)
    return gffutils.FeatureDB(db_file)
def connectgffdb(gff):
    """Return a gffutils FeatureDB for `gff`, creating `<gff>.db` if absent."""
    db_path = gff + ".db"
    if path.exists(db_path):
        return gffutils.FeatureDB(db_path)
    return gffutils.create_db(gff,
                              db_path,
                              disable_infer_transcripts=True,
                              disable_infer_genes=True,
                              keep_order=True,
                              verbose=False)
Beispiel #27
0
def GenerateExonIntervalTree(gtf_file,
                             overhang=(100, 100),  # overhang from the exon
                             gtf_db_path=":memory:",
                             out_file=None,
                             disable_infer_transcripts=True,
                             disable_infer_genes=True,
                             firstLastNoExtend=True,
                             source_filter=None):
    """
    Build IntervalTree object from gtf file for one feature unit (e.g. gene, exon)
    If give out_file, pickle it
    gtf_file: gtf format file or pickled Intervaltree object.
    overhang: flanking intron length to take along with exon. Corresponding to left (acceptor side) and right (donor side)
    gtf_db_path: (optional) gtf database path. Database for one gtf file only need to be created once
    out_file: (optional) file path to store the pickled Intervaltree obejct. Next time run it can be given to `gtf_file`
    disable_infer_transcripts: option to disable infering transcripts. Can be True if the gtf file has transcripts annotated.
    disable_infer_genes: option to disable infering genes. Can be True if the gtf file has genes annotated.
    firstLastNoExtend: if True, overhang is not taken for 5' of the first exon, or 3' of the last exon of a gene.
    source_filter: gene source filters, such as "protein_coding" filter for protein coding genes
    """
    # Reuse an existing gffutils database when possible; build one otherwise.
    try:
        gtf_db = gffutils.interface.FeatureDB(gtf_db_path)
    except ValueError:
        gtf_db = gffutils.create_db(
            gtf_file,
            gtf_db_path,
            disable_infer_transcripts=disable_infer_transcripts,
            disable_infer_genes=disable_infer_genes)

    genes = gtf_db.features_of_type('gene')
    exonTree = IntervalTree()
    # `overhang` is temporarily mutated per exon and restored from this copy.
    default_overhang = overhang
    for gene in genes:
        if source_filter is not None:
            if gene.source != source_filter:
                continue
        for exon in gtf_db.children(gene, featuretype='exon'):
            isLast = False  # track whether is last exon
            if firstLastNoExtend:
                # Exon at the gene's 3' boundary: drop the donor-side overhang.
                if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start):
                    overhang = (overhang[0], 0)
                    isLast = True
                # Exon at the gene's 5' boundary: drop the acceptor-side overhang.
                elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end):
                    # int(exon.attributes['exon_number'][0]) == 1:
                    overhang = (0, overhang[1])
            iv = ExonInterval.from_Feature(exon, overhang)
            iv.isLast = isLast
            # Restore the caller-supplied overhang before the next exon.
            overhang = default_overhang
            exonTree.insert(iv)
    if out_file is not None:
        with open(out_file, 'wb') as f:
            pickle.dump(exonTree, f)
    return exonTree
Beispiel #28
0
def CreateDB(FileNameGFF, FileNameDB):
    '''
    This function creates a GFF database
    '''
    gffutils.create_db(FileNameGFF,
                       dbfn=FileNameDB,
                       force=True,  # overwrite any existing database file
                       merge_strategy='merge',
                       keep_order=True,
                       sort_attribute_values=True)
    return
def update_gff_db(strain_gff, db_out_dir):
    """
    Create (if needed) a gffutils db for strain_gff under db_out_dir, then
    update it so each gene yields one record per entry in its Target
    attribute, with the Target id folded into the gene's ID.

    :param strain_gff: pathlib.Path of the strain GFF file
    :param db_out_dir: pathlib.Path of the directory holding .db files
    """
    def gff_update_iterator(db):
        # Yield one copy of each gene per Target entry, rewriting ID to
        # "<target>_<chrom>" and Target to that single entry.
        for gene in db.features_of_type('gene'):
            for target_id in gene.attributes["Target"][0].split(","):
                gene.attributes['ID'] = target_id.split(" ")[0] + "_" + gene.chrom
                gene.attributes['Target'] = target_id
                yield gene

    db_path = str(db_out_dir / (strain_gff.stem + ".db"))
    try:
        gffutils.create_db(str(strain_gff), dbfn=db_path)
    # BUGFIX: narrowed the bare `except:` — best-effort creation (the db may
    # already exist), but KeyboardInterrupt/SystemExit must not be swallowed.
    except Exception:
        pass
    db = gffutils.FeatureDB(db_path)
    db.update(gff_update_iterator(db), merge_strategy="create_unique")
Beispiel #30
0
    def parse_input(self):
        """Open the annotation database, building it from self.input when no
        database file exists yet at self.database."""
        import gffutils

        if os.path.exists(self.database):
            # Reuse the previously built database.
            self.db = gffutils.FeatureDB(self.database, keep_order=True)
            return
        self.db = gffutils.create_db(self.input,
                                     dbfn=self.database,
                                     force=True,
                                     keep_order=True,
                                     sort_attribute_values=True)
Beispiel #31
0
def UTR3spergene(gff):
	"""Return {geneid: [[UTRstart, UTRstop], ...]} mapping each gene in
	`gff` to the unique coordinate pairs of its level-1 UTR3 children.

	Builds (or reuses) a gffutils index `<basename>.db` in the cwd.
	NOTE(review): Python 2 code (print statements); tab-indented.
	"""
	utrs = {} #{geneid: [[UTRstart, UTRstop], [UTRstart, UTRstop]]}
	print 'Indexing gff...'
	gff_fn = gff
	db_fn = os.path.basename(gff_fn) + '.db'
	if os.path.isfile(db_fn) == False:
		gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)

	db = gffutils.FeatureDB(db_fn)
	print 'Done indexing!'

	genes = db.features_of_type('gene')
	for gene in genes:
		# Gene IDs in the db carry a 'gene:' prefix; strip it for the keys.
		geneid = str(gene.id).replace('gene:', '')
		if geneid not in utrs:
			utrs[geneid] = []
		# Record each distinct (start, end) pair only once per gene.
		for utr in db.children(gene, level = 1, featuretype = 'UTR3'):
			if [utr.start, utr.end] not in utrs[geneid]:
				utrs[geneid].append([utr.start, utr.end])

	return utrs
Beispiel #32
0
 def process_loc_for_gff(self, zin, gff_fname, assm_acc, seq_acc, start, stop, extra_fields):
     """Extract `gff_fname` from the zip `zin` into a temp file, index it
     with gffutils, and emit genes overlapping [start, stop] on seq_acc."""
     with tempfile.NamedTemporaryFile() as tmpfile:
         tmpfile.write(zin.read(gff_fname))
         # BUGFIX: flush so gffutils sees the complete file when it re-opens
         # tmpfile.name; otherwise the tail of the GFF can still be sitting
         # in the write buffer.
         tmpfile.flush()
         db = gffutils.create_db(
             tmpfile.name,
             dbfn=':memory:',
             force=True,
             keep_order=True,
             merge_strategy='merge',
             sort_attribute_values=True
         )
         find_genes_by_loc(db, self.writer, assm_acc, seq_acc, start, stop, extra_fields)
Beispiel #33
0
def gff2bed(gfffile, dbname, feature='gene', target='ID'):
    """Print BED-like lines (seqid, start, end, <target attribute>) for every
    `feature` record in the GFF; reuses `dbname` when it already exists."""
    try:
        db = gffutils.create_db(gfffile, dbname)
    except sqlite3.OperationalError:
        # Database was created on a previous run -- open it instead.
        db = gffutils.FeatureDB(dbname)

    for record in db.features_of_type(feature):
        value, = record[target]  # attribute must hold exactly one value
        fields = [record.seqid, record.start, record.end, value]
        print("\t".join(map(str, fields)))
Beispiel #34
0
    def parse_gtf(gene_annotations_gtf, gtf_db_file):
        """
        Convert GTF file into a FeatureDB.

        :param gene_annotations_gtf: path to the input GTF file
        :param gtf_db_file: path for the gffutils sqlite database
        :return: a gffutils.FeatureDB opened on gtf_db_file
        """
        try:
            gffutils.create_db(gene_annotations_gtf,
                               gtf_db_file,
                               merge_strategy="create_unique",
                               keep_order=True,
                               disable_infer_transcripts=True,
                               disable_infer_genes=True,
                               verbose=True,
                               force=False)
        except Exception as e:  # already exists
            # BUGFIX: corrected the "Databae" typo in the printed message.
            print("Database already exists" + str(e), gtf_db_file)

        db = gffutils.FeatureDB(gtf_db_file)
        return db
Beispiel #35
0
def read_index(gff_file, inmemory=False):
    """
    Read in a gffutils index for fast retrieval of features.
    """
    import gffutils
    from subprocess import call

    db_path = "{0}.db".format(gff_file)
    gz_path = "{0}.gz".format(db_path)

    if inmemory:
        return gffutils.create_db(gff_file, ':memory:')

    # Decompress a gzipped index if that is what exists on disk.
    if op.exists(gz_path):
        call('gunzip {0}'.format(gz_path), \
            shell=True, executable='/bin/bash')

    if op.exists(db_path):
        return gffutils.FeatureDB(db_path)

    return gffutils.create_db(gff_file, db_path)
Beispiel #36
0
def main():
    """Build or open the annotation db, parse variants, and combine them."""
    args = argparser()
    try:
        db = gffutils.create_db(args.annotation, dbfn=args.database,
                                force=False, keep_order=False,
                                merge_strategy='merge',
                                sort_attribute_values=False)
    except OperationalError:
        # Database already exists on disk; just open it.
        db = gffutils.FeatureDB(args.database)
    if args.verbose:
        print("Parsing variant file")
    variants, novariant_depth = parse_annvars(args.mutation)
    if args.verbose:
        print("Combining variants with ontology database")
    combine_annotations(variants, db, novariant_depth,
                        outf=args.output, verbose=args.verbose)
def generate_transcriptome(ANNOT, fasta, gff, ID, ffn):
    """
    Extract the sequence of every `ANNOT` feature from `gff` and write a
    line-wrapped FASTA transcriptome to `ffn`.

    :param ANNOT: feature type to extract (e.g. 'CDS')
    :param fasta: path to the genome FASTA
    :param gff: path to the annotation GFF
    :param ID: attribute name used as the FASTA header
    :param ffn: output FASTA path
    """
    db = gffutils.create_db(gff, "tmp.db", merge_strategy="create_unique", force=True)
    fasta = pyfaidx.Fasta(fasta)
    # Context manager guarantees the output is closed even if a lookup fails.
    with open(ffn, "w") as results:
        for cds in db.features_of_type(ANNOT, order_by="start"):
            gff_id = ''.join(cds[ID]).strip()
            fasta_sequence = cds.sequence(fasta)
            clean_sequence = fasta_wrapper.fasta_wrapper(fasta_sequence, 60)
            results.write(">" + gff_id + "\n" + clean_sequence + "\n")