def make_db(): """ Create gffutils database """ size, md5, fn = DB if not _up_to_date(md5, fn): gffutils.create_db(fn.replace('.db', ''), fn, verbose=True, force=True)
def add_removed_evm(pasa, exon, wd):
    """
    Here the clusters of sequences from the same locus are prepared
    """
    db_evm = gffutils.create_db(pasa, ':memory:',
                                merge_strategy='create_unique',
                                keep_order=True)
    ids_evm = [gene.attributes["ID"][0]
               for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(exon, ':memory:',
                                 merge_strategy='create_unique',
                                 keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0]
                     for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0]
                for gene in db_gmap.features_of_type("mRNA")]

    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    uniq_gene = [gene.attributes["ID"][0]
                 for mrna in uniq_evm
                 for gene in db_evm.parents(mrna)]
    uniq = list(set(uniq_gene))

    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.",
                                          suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)
    for name in uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()
    return outfile.name
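# A minimal usage sketch for add_removed_evm() above, under assumed inputs:
# 'evm.gff3' (the PASA/EVM annotation), 'gmap.gff3' (the mapped exon
# evidence), and '/tmp' as the working directory are all hypothetical.
# The function returns the path of the merged "additional" gff3 it writes.
def example_add_removed_evm_usage():
    merged = add_removed_evm('evm.gff3', 'gmap.gff3', '/tmp')
    print('merged annotation written to', merged)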
def make_db(i):
    """
    Module-level function that can be pickled across processes for
    multiprocessing testing.
    """
    # `fn` is expected to be defined at module level by the test setup.
    gffutils.create_db(fn, ':memory:', _keep_tempfiles='.%s' % i)
    return i
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.
    """
    print("Testing creation of DB from URL iterator")

    # Initially run SimpleHTTPServer at port 0 so the OS picks the first
    # available port.
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)

    # Serve the test/data folder
    served_folder = gffutils.example_filename('')
    os.chdir(served_folder)

    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    server_thread.daemon = True
    server_thread.start()

    try:
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:",
                                    keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert len(list(gene_feats)) != 0, "Could not load genes from GFF."
    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
def test_delete():
    db_fname = gffutils.example_filename("gff_example1.gff3")

    # incrementally delete all features
    db = gffutils.create_db(db_fname, ':memory:')
    ids = [i.id for i in db.all_features()]
    current = set(ids)
    for _id in ids:
        db.delete(_id)
        expected = current.difference([_id])
        current = set([i.id for i in db.all_features()])
        assert current == expected, (current, expected)
    assert len(current) == 0

    # same thing, but as a list of Feature objects rather than string IDs
    db = gffutils.create_db(db_fname, ':memory:')
    features = list(db.all_features())
    current = set(features)
    for feature in features:
        db.delete(feature)
        expected = current.difference([feature])
        current = set(list(db.all_features()))
        assert current == expected, (current, expected)
    assert len(current) == 0, current

    # same thing, but use a FeatureDB.
    db1 = gffutils.create_db(db_fname, ':memory:')
    db2 = gffutils.create_db(db_fname, ':memory:')
    db1.delete(db2)
    assert len(list(db1.all_features())) == 0

    db = gffutils.create_db(db_fname, ':memory:')
    db.delete('nonexistent')
def add_EVM(final_update, wd, consensus_mapped_gff3):
    """
    Write the EVM mRNAs that are absent from the GMAP consensus, together
    with all GMAP genes, to a temporary "additional" gff3 file.
    """
    db_evm = gffutils.create_db(final_update, ':memory:',
                                merge_strategy='create_unique',
                                keep_order=True)
    ids_evm = [gene.attributes["ID"][0]
               for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:',
                                 merge_strategy='create_unique',
                                 keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0]
                     for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0]
                for gene in db_gmap.features_of_type("gene")]

    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))

    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.",
                                          suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)
    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()
    return outfile.name
def create_from(self, filename, forceCreate=False):
    """Create the gffutils sqlite db for the specified .gff file"""
    gffutils.create_db(filename, dbfn=self.dbname, force=forceCreate,
                       keep_order=False, merge_strategy='merge')
def parsecontextscores(csfile, gff, featurename):
    # Make a dictionary of this form:
    # {UTRname : [[UTRlength], [names of all miRNAs that have sites in that UTR]]}
    # csfile = output of targetscan_60_context_scores.pl
    # gff = gff file of regions of interest
    # featurename = feature category in gff file (3rd field)
    lengthdict = {}
    CSdict = {}

    # First we need the UTR lengths.
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        # if the database doesn't exist, create it
        gffutils.create_db(gff_fn, db_fn)
    db = gffutils.FeatureDB(db_fn)
    features = db.features_of_type(featurename)
    for feature in features:
        featureid = feature.id
        featurelength = feature.stop - feature.start
        lengthdict[featureid] = featurelength
    os.remove(db_fn)

    # Now get the miRNA names.
    with open(csfile, 'r') as csfilehandle:
        for line in csfilehandle:
            line = line.strip().split('\t')
            if line[0] != 'Gene ID':  # skip header line
                featureid = line[0].split(';')[0]  # remove Parent=...
                species = line[1]
                miRNAname = line[2]
                if species == '10090':  # mouse; change this number for other species
                    if featureid not in CSdict:
                        CSdict[featureid] = [[lengthdict[featureid]],
                                             [miRNAname]]
                    else:
                        CSdict[featureid][1].append(miRNAname)
    return CSdict
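# A minimal usage sketch for parsecontextscores() above, with hypothetical
# paths: 'context_scores.txt' stands in for the targetscan_60_context_scores.pl
# output and 'utrs.gff' for the UTR regions ('UTR3' in the third field).
def example_parsecontextscores_usage():
    CSdict = parsecontextscores('context_scores.txt', 'utrs.gff', 'UTR3')
    for utr, (lengths, mirnas) in CSdict.items():
        print(utr, lengths[0], len(mirnas))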
def test_disable_infer():
    """
    Tests the new semantics for disabling gene/transcript inference.
    """
    # To start, we construct a GTF db by inferring genes and transcripts
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                            ':memory:')

    # Then create a file missing transcripts, and another missing genes.
    import tempfile
    tempfile.tempdir = None
    no_transcripts = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    no_genes = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    for feature in db.all_features():
        if feature.featuretype != 'transcript':
            no_transcripts.write(str(feature) + '\n')
        if feature.featuretype != 'gene':
            no_genes.write(str(feature) + '\n')
    no_genes.close()
    no_transcripts.close()

    no_tx_db = gffutils.create_db(no_transcripts.name, ':memory:',
                                  disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes.name, ':memory:',
                                  disable_infer_genes=True)
    no_xx_db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:',
        disable_infer_genes=True, disable_infer_transcripts=True)

    # no transcripts but 3 genes
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3

    # no genes but 4 transcripts
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4

    # no genes or transcripts
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
def _get_gtf_db(gtf):
    db_file = gtf + ".db"
    if not file_exists(db_file):
        print("Creating gffutils database for %s." % (gtf))
        disable_infer_transcripts, disable_infer_genes = \
            guess_disable_infer_extent(gtf)
        if not disable_infer_transcripts or not disable_infer_genes:
            print("'transcript' or 'gene' entries not found, so inferring "
                  "their extent. This can be very slow.")
        id_spec = guess_id_spec(gtf)
        gffutils.create_db(gtf, dbfn=db_file,
                           disable_infer_genes=disable_infer_genes,
                           disable_infer_transcripts=disable_infer_transcripts,
                           id_spec=id_spec,
                           merge_strategy="create_unique",
                           keep_order=True,
                           verbose=True)
    return gffutils.FeatureDB(db_file)
def create_database(gff_file, database_name):
    _ = gffutils.create_db(gff_file,
                           dbfn=f"{database_name}.db",
                           force=True,
                           keep_order=True,
                           merge_strategy="merge",
                           sort_attribute_values=True,
                           verbose=True)
    return
def getexoniccoords(posfactors, gff):
    # Take the posfactors dictionary and, for each gene, get the exonic
    # coords of its transcripts.
    # If the txend of a transcript is exonic in every transcript that has a
    # higher PF, then this gene is all TUTR.
    # If no txend is exonic in any other transcript, then this gene is all ALE.
    # If neither of these is true, then the gene is somehow mixed.
    # posfactors = {ENSMUSG : {ENSMUST : positionfactor}}

    # Make gff database
    print('Indexing gff...')
    gff_fn = gff
    db_fn = os.path.abspath(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn, merge_strategy='merge',
                           verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    exoniccoords = {}  # {geneid : {txid : [positionfactor, [set of exonic coords]]}}
    for gene in posfactors:
        txs = posfactors[gene].keys()
        geneexons = {}  # {txid : [positionfactor, [set of exonic coords]]}
        for tx in txs:
            txexons = []
            pf = posfactors[gene][tx]
            tx = db['transcript:' + tx]
            if tx.strand == '+':
                for exon in db.children(tx, featuretype='exon',
                                        order_by='start'):
                    txexons += list(range(exon.start, exon.end + 1))
            elif tx.strand == '-':
                for exon in db.children(tx, featuretype='exon',
                                        order_by='start', reverse=True):
                    txexons += list(range(exon.start, exon.end + 1))
            geneexons[tx.id] = [pf, set(txexons)]
        exoniccoords[gene] = geneexons
    return exoniccoords
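# A minimal sketch of the posfactors structure getexoniccoords() expects,
# using hypothetical Ensembl IDs and a hypothetical 'annotation.gff' path;
# each gene maps transcript IDs to their position factors.
def example_getexoniccoords_usage():
    posfactors = {'ENSMUSG00000000001': {'ENSMUST00000000001': 0.0,
                                         'ENSMUST00000000002': 1.0}}
    exoniccoords = getexoniccoords(posfactors, 'annotation.gff')
    for gene, txs in exoniccoords.items():
        for tx, (pf, coords) in txs.items():
            print(gene, tx, pf, len(coords))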
def parse_gff3(self):
    print("-------- Ensembl data Parsing --------")
    print("\tParsing gff3 file...")
    print("\tcreating temporary database from file: " + self.gff)
    fn = gffutils.example_filename(self.gff)
    db = gffutils.create_db(fn, ":memory:", merge_strategy="create_unique")
    # Alternatively, build a persistent database:
    # gffutils.create_db(fn, "DB.Ensembl_" + self.species[0] + ".db",
    #                    merge_strategy="create_unique")
    # db = gffutils.FeatureDB("DB.Ensembl_" + self.species[0] + ".db")
    self.collect_genes(db)
    self.collect_Transcripts(db)
def test_pr_139():
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:')
    exons = list(db.features_of_type('exon'))
    inter = list(db.interfeatures(exons))

    # Previously, the first exon's attributes would show up in subsequent
    # merged features.
    assert exons[0].attributes['Name'][0] not in inter[1].attributes['Name']
    assert exons[0].attributes['Name'][0] not in inter[2].attributes['Name']
    assert exons[0].attributes['Name'][0] not in inter[3].attributes['Name']
def restore_gff_db(gtf_fn):
    gtf_db = None
    if gtf_fn is not None:
        gtf_db_fn = gtf_fn + '.gffdb'
        if not os.path.isfile(gtf_db_fn):
            try:
                # check if 'gene' or 'transcript' is in the GTF
                disable_gene, disable_trans = False, False
                with open(gtf_fn) as fp:
                    l = 0
                    for line in fp:
                        line = line.rstrip()
                        if len(line) < 1:
                            continue
                        if line[0] != '#':
                            if line.split()[2] == 'gene':
                                disable_gene = True
                            elif line.split()[2] == 'transcript':
                                disable_trans = True
                            l += 1
                            if (disable_gene and disable_trans) or l == 100:
                                break
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} ...'.format(gtf_fn))
                gtf_db = gu.create_db(gtf_fn, gtf_db_fn,
                                      disable_infer_genes=disable_gene,
                                      disable_infer_transcripts=disable_trans)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} done!'.format(gtf_fn))
            except:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nCheck if annotation file format '
                    'is correct'.format(gtf_fn))
                sys.exit(IOError)
        else:
            try:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} ...'.format(gtf_fn))
                gtf_db = gu.FeatureDB(gtf_db_fn)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} done!'.format(gtf_fn))
            except:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nTry to remove this db file and '
                    're-run'.format(gtf_db_fn))
                sys.exit(IOError)
    return gtf_db
def _output_gff3(gff3_file, out_file, dialect):
    db = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in DataIterator(db.features_of_type("exon"),
                                        dialect=dialect):
                transcript_id = feature["Parent"][0]
                gene_id = db[transcript_id]["Parent"][0]
                attr = {"transcript_id": transcript_id, "gene_id": gene_id}
                attributes = gffutils.attributes.Attributes(attr)
                feature.attributes = attributes
                print(feature, file=out_handle, end="")
def create_db(gtf, dbfn):
    """
    From a 'gtf' file, create a 'dbfn' sqlite database.
    """
    logger.info('Creating db')
    gffutils.create_db(
        gtf,
        dbfn=dbfn,
        force=True,  # delete the db if it already exists
        merge_strategy='merge',
        id_spec={
            'exon': 'exon_id',
            'gene': 'gene_id',
            'transcript': 'transcript_id'
        },
        disable_infer_transcripts=True,
        disable_infer_genes=True)
    logger.info('db done')
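# A minimal usage sketch for the create_db() wrapper above, assuming a
# hypothetical 'annotation.gtf'; because the explicit id_spec keys features
# by gene_id/transcript_id/exon_id, lookups can use those IDs directly.
def example_create_db_usage():
    create_db('annotation.gtf', 'annotation.gtf.db')
    db = gffutils.FeatureDB('annotation.gtf.db')
    gene = db['ENSG00000000003']  # hypothetical gene_id
    print(gene.featuretype, gene.start, gene.end)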
def gff_gene_check(GffName, db_name, memory=0):
    # The IDs in the GFFs must be unique. Moreover, because only the gene
    # information is needed, all other feature lines are dropped from the
    # GFFs; comment lines are kept.
    tempgff = ""
    for line in open(GffName):
        if line[0] != "#":
            if line.split()[2] == "gene":
                tempgff += line
        else:
            tempgff += line
    if memory:
        # Build the db in memory and return it as a variable so it can be
        # used as a subclass of _DBCreator.
        dbout = gffutils.create_db(tempgff, ":memory:", from_string=True)
        return dbout
    else:
        gffutils.create_db(tempgff, db_name, from_string=True)
def open_gff_db(gff):
    try:
        db = gffutils.FeatureDB(gff)
    except sqlite3.DatabaseError:
        if os.path.exists(f"{gff}.sqlite3"):
            sys.stderr.write(
                f"File {gff}.sqlite3 exists. Using existing database.\n")
            db = gffutils.FeatureDB(f"{gff}.sqlite3")
        else:
            db = gffutils.create_db(gff, f"{gff}.sqlite3")
    return db
def get_gtf_db(gtf, in_memory=False):
    """
    Create a gffutils DB.
    """
    db_file = ":memory:" if in_memory else gtf + ".db"
    if in_memory or not file_exists(db_file):
        db = gffutils.create_db(gtf, dbfn=db_file)
    if in_memory:
        return db
    else:
        return gffutils.FeatureDB(db_file)
def __init__(self, file, in_memory=False, db_path=None):
    self.remove_db = False
    if in_memory:
        self.dbfn = ":memory:"
    elif db_path is not None:
        self.dbfn = db_path
    else:
        with tempfile.NamedTemporaryFile() as t:
            self.dbfn = t.name
        self.remove_db = True
    self.db = gffutils.create_db(file, self.dbfn, merge_strategy="error")
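# A minimal usage sketch for the constructor above. Only __init__ is shown,
# so the class name GFFDatabase and the 'annotation.gff3' path are both
# hypothetical; in-memory mode avoids leaving a database file behind.
def example_wrapper_usage():
    gdb = GFFDatabase('annotation.gff3', in_memory=True)
    print(len(list(gdb.db.all_features())))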
def split3UTR(UTR3gff, fragsize, outfile):
    gff_fn = UTR3gff
    print('Indexing gff...')
    db_fn = os.path.abspath(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn, merge_strategy='merge',
                           verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    UTR3s = db.features_of_type('UTR3')
    with open(outfile, 'w') as outfh:
        for UTR3 in UTR3s:
            # Only going to consider single-exon UTRs
            if len(list(db.children(UTR3, featuretype='exon',
                                    level=1))) > 1:
                continue
            ID = UTR3.attributes['ID'][0]
            parent = UTR3.attributes['Parent'][0]
            gene_id = UTR3.attributes['gene_id'][0]
            coord = UTR3.start
            counter = 1
            while coord <= UTR3.end:
                windowstart = coord
                windowend = coord + fragsize
                idfield = ('ID=' + ID +
                           '.utr3fragment{0}'.format(counter) +
                           ';Parent=' + parent + ';gene_id=' + gene_id)
                outfh.write('\t'.join([
                    str(UTR3.chrom), 'longest3UTRfrags', 'UTR3frag',
                    str(windowstart), str(windowend), '.',
                    str(UTR3.strand), '.', idfield
                ]) + '\n')
                coord = coord + fragsize + 1
                counter += 1
    os.remove(db_fn)
def test_iterator_update():
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    assert len(list(db.all_features())) == 12
    orig_exon_coords = set([(i.start, i.stop)
                            for i in db.features_of_type('exon')])

    # reset all features to have the same coords of start=1, stop=100
    def gen():
        for i in db.features_of_type('gene'):
            i.start = 1
            i.stop = 100
            yield i

    db.update(gen(), merge_strategy='replace')
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop)
                                    for i in db.features_of_type('exon')])

    def _transform(f):
        f.start = 1
        f.stop = 100
        return f

    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    db.update(db.features_of_type('gene'), merge_strategy='replace',
              transform=_transform)
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop)
                                    for i in db.features_of_type('exon')])
def subset_db(self, output_path, genes=None, region_bed=None, span=500000,
              disable_infer_genes=True, disable_infer_transcripts=True):
    all_features = {}
    if region_bed is not None:
        if isinstance(region_bed, (str, pathlib.Path)):
            region_bed = pd.read_csv(region_bed, sep='\t',
                                     index_col=0, header=None)
        for _, (chrom, start, end) in region_bed.iterrows():
            start = max(start - span, 0)
            # It's OK to exceed the chromosome end; we just select the
            # valid gtf features here.
            end = end + span
            features = self.region((chrom, start, end))
            for f in features:
                all_features[f.id] = f
    if genes is not None:
        for gene in genes:
            feature = self.get_gene_feature(gene)
            start = max(feature.start - span, 0)
            end = feature.end + span
            features = self.region((feature.chrom, start, end))
            for f in features:
                all_features[f.id] = f
    if len(all_features) == 0:
        print('No features selected.')
        return
    else:
        create_db(list(all_features.values()),
                  dbfn=output_path,
                  disable_infer_genes=disable_infer_genes,
                  disable_infer_transcripts=disable_infer_transcripts)
    return
def get_gtf_db(gtf, in_memory=False):
    """
    Create a gffutils DB from a GTF file, reusing an existing gffutils
    database if it is named {gtf}.db.
    """
    db_file = ":memory:" if in_memory else gtf + ".db"
    if in_memory or not os.path.exists(db_file):
        db = gffutils.create_db(gtf, dbfn=db_file, disable_infer_genes=True)
    if in_memory:
        return db
    else:
        return gffutils.FeatureDB(db_file)
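# A minimal usage sketch contrasting the two modes of the get_gtf_db()
# variant above, with a hypothetical 'annotation.gtf': the in-memory database
# is rebuilt on every call, while the on-disk form is created once and then
# reused as 'annotation.gtf.db'.
def example_get_gtf_db_usage():
    db_mem = get_gtf_db('annotation.gtf', in_memory=True)  # rebuilt each call
    db_disk = get_gtf_db('annotation.gtf')                 # cached on disk
    print(len(list(db_mem.features_of_type('gene'))),
          len(list(db_disk.features_of_type('gene'))))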
def connectgffdb(gff):
    gffdb = gff + ".db"
    if path.exists(gffdb):
        dbc = gffutils.FeatureDB(gffdb)
    else:
        dbc = gffutils.create_db(gff, gff + '.db',
                                 disable_infer_transcripts=True,
                                 disable_infer_genes=True,
                                 keep_order=True,
                                 verbose=False)
    return dbc
def GenerateExonIntervalTree(gtf_file,
                             overhang=(100, 100),  # overhang from the exon
                             gtf_db_path=":memory:",
                             out_file=None,
                             disable_infer_transcripts=True,
                             disable_infer_genes=True,
                             firstLastNoExtend=True,
                             source_filter=None):
    """
    Build an IntervalTree object from a gtf file for one feature unit
    (e.g. gene, exon). If out_file is given, pickle the tree.

    gtf_file: gtf format file or pickled IntervalTree object.
    overhang: flanking intron length to take along with the exon,
        corresponding to the left (acceptor) and right (donor) sides.
    gtf_db_path: (optional) gtf database path. The database for a gtf file
        only needs to be created once.
    out_file: (optional) file path to store the pickled IntervalTree object.
        On the next run it can be given as `gtf_file`.
    disable_infer_transcripts: option to disable inferring transcripts.
        Can be True if the gtf file has transcripts annotated.
    disable_infer_genes: option to disable inferring genes. Can be True if
        the gtf file has genes annotated.
    firstLastNoExtend: if True, no overhang is taken at the 5' end of the
        first exon or the 3' end of the last exon of a gene.
    source_filter: gene source filter, such as "protein_coding" to keep
        only protein-coding genes.
    """
    try:
        gtf_db = gffutils.interface.FeatureDB(gtf_db_path)
    except ValueError:
        gtf_db = gffutils.create_db(
            gtf_file,
            gtf_db_path,
            disable_infer_transcripts=disable_infer_transcripts,
            disable_infer_genes=disable_infer_genes)
    genes = gtf_db.features_of_type('gene')
    exonTree = IntervalTree()
    default_overhang = overhang
    for gene in genes:
        if source_filter is not None:
            if gene.source != source_filter:
                continue
        for exon in gtf_db.children(gene, featuretype='exon'):
            isLast = False  # track whether this is the last exon
            if firstLastNoExtend:
                if (gene.strand == "+" and exon.end == gene.end) or \
                        (gene.strand == "-" and exon.start == gene.start):
                    overhang = (overhang[0], 0)
                    isLast = True
                elif (gene.strand == "+" and exon.start == gene.start) or \
                        (gene.strand == "-" and exon.end == gene.end):
                    # i.e. int(exon.attributes['exon_number'][0]) == 1
                    overhang = (0, overhang[1])
            iv = ExonInterval.from_Feature(exon, overhang)
            iv.isLast = isLast
            overhang = default_overhang
            exonTree.insert(iv)
    if out_file is not None:
        with open(out_file, 'wb') as f:
            pickle.dump(exonTree, f)
    return exonTree
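# A minimal usage sketch for GenerateExonIntervalTree() above, with
# hypothetical paths. overhang=(100, 100) takes 100 nt of flanking intron on
# the acceptor and donor sides, except at the first/last exon of a gene when
# firstLastNoExtend is True; per the docstring, the pickle written to
# out_file can be supplied on later runs instead of re-parsing the gtf.
def example_exon_interval_tree_usage():
    exonTree = GenerateExonIntervalTree('annotation.gtf',
                                        overhang=(100, 100),
                                        out_file='exons.pkl')
    return exonTree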
def CreateDB(FileNameGFF, FileNameDB):
    '''
    This function creates a GFF database
    '''
    db = gffutils.create_db(FileNameGFF,
                            dbfn=FileNameDB,
                            force=True,
                            keep_order=True,
                            merge_strategy='merge',
                            sort_attribute_values=True)
    return
def update_gff_db(strain_gff, db_out_dir):
    def gff_update_iterator(db):
        for gene in db.features_of_type('gene'):
            # modify attributes: add a new ID per alignment target
            for target_id in gene.attributes["Target"][0].split(","):
                gene.attributes['ID'] = (target_id.split(" ")[0] + "_" +
                                         gene.chrom)
                gene.attributes['Target'] = target_id
                yield gene

    try:
        gffutils.create_db(
            str(strain_gff),
            dbfn=str(db_out_dir / (strain_gff.stem + ".db"))
        )
    except:
        # the database may already exist
        pass
    db = gffutils.FeatureDB(str(db_out_dir / (strain_gff.stem + ".db")))
    db.update(gff_update_iterator(db), merge_strategy="create_unique")
def parse_input(self):
    import gffutils

    # Check if the database file already exists
    if os.path.exists(self.database):
        self.db = gffutils.FeatureDB(self.database, keep_order=True)
    else:
        self.db = gffutils.create_db(self.input,
                                     dbfn=self.database,
                                     force=True,
                                     keep_order=True,
                                     sort_attribute_values=True)
def UTR3spergene(gff):
    utrs = {}  # {geneid: [[UTRstart, UTRstop], [UTRstart, UTRstop]]}
    print('Indexing gff...')
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn, merge_strategy='merge',
                           verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    genes = db.features_of_type('gene')
    for gene in genes:
        geneid = str(gene.id).replace('gene:', '')
        if geneid not in utrs:
            utrs[geneid] = []
        for utr in db.children(gene, level=1, featuretype='UTR3'):
            if [utr.start, utr.end] not in utrs[geneid]:
                utrs[geneid].append([utr.start, utr.end])
    return utrs
def process_loc_for_gff(self, zin, gff_fname, assm_acc, seq_acc, start,
                        stop, extra_fields):
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(zin.read(gff_fname))
        tmpfile.flush()  # make sure the data is on disk before gffutils reads it
        db = gffutils.create_db(
            tmpfile.name,
            dbfn=':memory:',
            force=True,
            keep_order=True,
            merge_strategy='merge',
            sort_attribute_values=True
        )
        find_genes_by_loc(db, self.writer, assm_acc, seq_acc, start, stop,
                          extra_fields)
def gff2bed(gfffile, dbname, feature='gene', target='ID'):
    # Try to create the database; if it already exists, use the
    # existing one.
    try:
        db = gffutils.create_db(gfffile, dbname)
    except sqlite3.OperationalError:
        db = gffutils.FeatureDB(dbname)
    for item in db.features_of_type(feature):
        tmp, = item[target]
        out = [item.seqid, item.start, item.end, tmp]
        print("\t".join(map(str, out)))
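# A minimal usage sketch for gff2bed() above, with hypothetical paths;
# it prints each gene as four BED-like columns (seqid, start, end, ID)
# to stdout.
def example_gff2bed_usage():
    gff2bed('annotation.gff3', 'annotation.db', feature='gene', target='ID')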
def parse_gtf(gene_annotations_gtf, gtf_db_file):
    """
    Convert a GTF file into a FeatureDB.

    :param gene_annotations_gtf:
    :param gtf_db_file:
    :return:
    """
    try:
        gffutils.create_db(gene_annotations_gtf, gtf_db_file,
                           merge_strategy="create_unique",
                           keep_order=True,
                           disable_infer_transcripts=True,
                           disable_infer_genes=True,
                           verbose=True,
                           force=False)
    except Exception as e:
        # the database already exists
        print("Database already exists: " + str(e), gtf_db_file)
    db = gffutils.FeatureDB(gtf_db_file)
    return db
def read_index(gff_file, inmemory=False):
    """
    Read in a gffutils index for fast retrieval of features.
    """
    import gffutils
    from subprocess import call

    gff_file_db = "{0}.db".format(gff_file)
    gff_file_db_gz = "{0}.gz".format(gff_file_db)

    if inmemory:
        return gffutils.create_db(gff_file, ':memory:')

    if op.exists(gff_file_db_gz):
        call('gunzip {0}'.format(gff_file_db_gz),
             shell=True, executable='/bin/bash')
    if op.exists(gff_file_db):
        return gffutils.FeatureDB(gff_file_db)
    return gffutils.create_db(gff_file, gff_file_db)
def main():
    args = argparser()
    try:
        db = gffutils.create_db(args.annotation,
                                dbfn=args.database,
                                force=False,
                                keep_order=False,
                                merge_strategy='merge',
                                sort_attribute_values=False)
    except OperationalError:
        db = gffutils.FeatureDB(args.database)
    if args.verbose:
        print("Parsing variant file")
    variants, novariant_depth = parse_annvars(args.mutation)
    if args.verbose:
        print("Combining variants with ontology database")
    combine_annotations(variants, db, novariant_depth,
                        outf=args.output, verbose=args.verbose)
def generate_transcriptome(ANNOT, fasta, gff, ID, ffn):
    db = gffutils.create_db(gff, "tmp.db", merge_strategy="create_unique",
                            force=True)
    fasta = pyfaidx.Fasta(fasta)
    results = open(ffn, "w")
    for cds in db.features_of_type(ANNOT, order_by="start"):
        gff_id = ''.join(cds[ID]).strip()
        fasta_sequence = cds.sequence(fasta)
        clean_sequence = fasta_wrapper.fasta_wrapper(fasta_sequence, 60)
        results.write(">" + gff_id + "\n" + clean_sequence + "\n")
    results.close()