class TestPerformanceOnMouse(PerformanceTestFeatureDB, unittest.TestCase):
    """Exercise the frequent query scenarios against the large mouse genome."""

    # GENCODE vM8 mouse annotation fixtures.
    gff_file = gffutils.example_filename('gencode.vM8.annotation.gff3')
    chromsizes_file = gffutils.example_filename('gencode.vM8.chromsizes.txt')
    gene_list = gffutils.example_filename('gencode.vM8.5000_gene_ids.txt')
    transcript_list = gffutils.example_filename(
        'gencode.vM8.5000_transcript_ids.txt')
def test_valid_line_count():
    """Each example file yields the expected number of iterated lines."""
    cases = [
        ('ncbi_gff3.txt', 17),
        ('hybrid1.gff3', 6),
        ('FBgn0031208.gff', 27),
    ]
    for fname, count in cases:
        it = iterators.FileIterator(example_filename(fname))
        assert len(list(it)) == count
class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
    """Exercise the frequent query scenarios against the yeast genome."""

    # Ensembl R64-1-1 (release 83) S. cerevisiae fixtures.
    gff_file = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.gff3')
    chromsizes_file = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.chromsizes.txt')
    gene_list = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.5000_gene_ids.txt')
    transcript_list = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.5000_transcript_ids.txt')
def test_clean_gff():
    """Exercise both the "full" and the basic GFF cleaning modes."""
    fn = gffutils.example_filename('dirty.gff')

    # Full cleaning: drop some featuretypes, sanity-check, and add "chr".
    gffutils.clean_gff(fn, newfn='cleaned.tmp',
                       featuretypes_to_remove=['pcr_product', 'protein'],
                       addchr=True)
    observed = open('cleaned.tmp').readlines()
    expected = open(gffutils.example_filename('fully-cleaned.gff')).readlines()
    assert observed == expected
    os.unlink('cleaned.tmp')

    # Basic cleaning: nothing removed, no sanity checks; the output is
    # written next to the input as "dirty.gff.cleaned".
    gffutils.clean_gff(fn, featuretypes_to_remove=None, sanity_check=False)
    observed = open(gffutils.example_filename('dirty.gff.cleaned')).read()
    expected = open(gffutils.example_filename('basic-cleaned.gff')).read()
    assert observed == expected
    os.unlink(gffutils.example_filename('dirty.gff.cleaned'))
def test_delete():
    """Features can be deleted by ID, by Feature object, or by FeatureDB."""
    db_fname = gffutils.example_filename("gff_example1.gff3")

    # Incrementally delete every feature by its string ID.
    db = gffutils.create_db(db_fname, ':memory:')
    ids = [i.id for i in db.all_features()]
    remaining = set(ids)
    for _id in ids:
        db.delete(_id)
        expected = remaining.difference([_id])
        remaining = set(i.id for i in db.all_features())
        assert remaining == expected, (remaining, expected)
    assert len(remaining) == 0

    # Same thing, but with Feature objects rather than string IDs.
    db = gffutils.create_db(db_fname, ':memory:')
    features = list(db.all_features())
    remaining = set(features)
    for feat in features:
        db.delete(feat)
        expected = remaining.difference([feat])
        remaining = set(db.all_features())
        assert remaining == expected, (remaining, expected)
    assert len(remaining) == 0, remaining

    # Same thing, but pass a whole FeatureDB to delete().
    db1 = gffutils.create_db(db_fname, ':memory:')
    db2 = gffutils.create_db(db_fname, ':memory:')
    db1.delete(db2)
    assert len(list(db1.all_features())) == 0

    # Deleting a nonexistent ID is tolerated.
    db = gffutils.create_db(db_fname, ':memory:')
    db.delete('nonexistent')
def test_empty_superclass_methods():
    """The base DBCreator's no-op hooks leave behind an empty db file."""
    creator = gffutils.db.DBCreator(
        gffutils.example_filename('FBgn0031208.gff'), 'empty.db',
        verbose=False)
    creator.populate_from_features([])
    creator.update_relations()
    # The file exists but nothing was ever written to it.
    assert os.path.exists('empty.db')
    assert os.stat('empty.db').st_size == 0
    os.unlink('empty.db')
def test_pr_131():
    """Updating with an empty iterable must not raise (see PR #131)."""
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'), ':memory:')
    # Previously raised ValueError("No lines parsed -- was an empty file
    # provided?")
    db.update([])
def write_isoforms(gff, refs, outfolder):
    """Assemble exon sequences into per-transcript isoform sequences.

    `refs` maps seqid -> reference sequence string. A gffutils database is
    written to `outfolder`/database.db as a side effect. Returns a dict of
    transcript ID -> spliced transcript sequence.
    """
    db_name = os.path.join(outfolder, 'database.db')
    fn = gffutils.example_filename(gff)
    db = gffutils.create_db(fn, dbfn=db_name, force=True, keep_order=True,
                            merge_strategy='merge',
                            sort_attribute_values=True)
    db = gffutils.FeatureDB(db_name, keep_order=True)
    transcripts = {}
    for gene in db.features_of_type('gene'):
        ref_seq = refs[gene.seqid]
        for transcript in db.children(gene, featuretype='transcript',
                                      order_by='start'):
            # GFF coordinates are 1-based inclusive; slice accordingly.
            pieces = [ref_seq[exon.start - 1:exon.end]
                      for exon in db.children(transcript, featuretype='exon',
                                              order_by='start')]
            transcripts[transcript.id] = ''.join(pieces)
    return transcripts
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.
    """
    print("Testing creation of DB from URL iterator")
    # Run SimpleHTTPServer at port 0 so the OS picks a free port.
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)
    # Serve the test/data folder; remember the cwd so it can be restored.
    served_folder = gffutils.example_filename('')
    savedir = os.getcwd()
    os.chdir(served_folder)
    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    # BUGFIX: the attribute is spelled "daemon"; the original "deamon" typo
    # just set an unused attribute, so the thread was never daemonized.
    server_thread.daemon = True
    server_thread.start()
    try:
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:",
                                    keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert len(list(gene_feats)) != 0, "Could not load genes from GFF."
    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
        # BUGFIX: restore the working directory so later tests are not run
        # from the served data folder (mirrors the sibling URL test).
        os.chdir(savedir)
def test_infer_gene_extent():
    """The legacy infer_gene_extent kwarg still works but warns once."""
    # Before we deprecate this, make sure it still works but emits a warning.
    with warnings.catch_warnings(record=True) as caught:
        gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                           ':memory:', infer_gene_extent=False)
    assert len(caught) == 1
def test_inspect_featuretypes():
    """inspect_featuretypes reports every featuretype in the file."""
    observed = gffutils.inspect_featuretypes(
        gffutils.example_filename('FBgn0031208.gff'))
    observed.sort()
    expected = ['CDS', 'exon', 'five_prime_UTR', 'gene', 'intron', 'mRNA',
                'pcr_product', 'protein', 'three_prime_UTR']
    # BUGFIX: the Python 2 print statement is a SyntaxError on Python 3,
    # which the rest of this file targets; use the print() function.
    print(observed)
    print(expected)
    assert observed == expected
def test_issue_79():
    """Round-trip: the first GTF line written back from the db matches."""
    gtf = gffutils.example_filename('keep-order-test.gtf')
    db = gffutils.create_db(
        gtf, 'tmp.db',
        disable_infer_genes=False,
        disable_infer_transcripts=False,
        id_spec={"gene": "gene_id", "transcript": "transcript_id"},
        merge_strategy="create_unique",
        keep_order=True,
        force=True)

    exp = open(gtf).read()
    obs = '\n'.join(str(i) for i in db.all_features())
    exp_1 = exp.splitlines(True)[0].strip()
    obs_1 = obs.splitlines(True)[0].strip()
    print('EXP')
    print(exp_1)
    print('OBS')
    print(obs_1)
    print('DIFF')
    print(''.join(difflib.ndiff([exp_1], [obs_1])))
    assert obs_1 == exp_1
def create_gff_db(dicoNiourk):
    """Build a gffutils db next to the RefSeq GFF referenced by
    `dicoNiourk`, with spinner/terminal feedback along the way."""
    print("\x1b[0;38;2;" + dicoNiourk["color"]["light1"] + "m")
    sys.stdout.write("\033[F")
    dicoNiourk["spinner"].text = " • Create RefSeq DB"
    dicoNiourk["spinner"].start()
    fn = gffutils.example_filename(dicoNiourk["refseq_gff"])
    gffutils.create_db(
        fn,
        dbfn=dicoNiourk["refseq_gff"].replace(".gff", ".db"),
        force=True,
        keep_order=True,
        merge_strategy='merge',
        sort_attribute_values=True)
    dicoNiourk["spinner"].stop()
    printcolor(" • RefSeq DB created\n", "0", dicoNiourk["color"]["light1"],
               None, dicoNiourk["color"]["bool"])
def convertGffToBedGffUtils(gffFile):
    """Print a BED representation of the gffutils database sitting next to
    `gffFile` (same basename, '.db' extension)."""
    fn = gffutils.example_filename(gffFile)
    # The database is assumed to exist already; the creation call is kept
    # for reference:
    # db = gffutils.create_db(fn, dbfn=gffFile[:-4] + '.db', force=True,
    #                         keep_order=True, merge_strategy='merge',
    #                         sort_attribute_values=True)
    db = gffutils.FeatureDB(gffFile[:-4] + '.db', keep_order=True)
    gffIterator = db.all_features(order_by='start')
    bedVersion = pybedtools_integration.to_bedtool(gffIterator)
    # BUGFIX: Python 2 print statement replaced with the print() function.
    print(bedVersion)
def test_deprecation_handler():
    """Placeholder until infer_gene_extent is actually deprecated."""
    # Intentionally disabled; the unreachable code below documents the
    # behavior that should hold once deprecation lands.
    return
    # TODO: when infer_gene_extent actually gets deprecated, test here.
    assert_raises(ValueError,
                  gffutils.create_db,
                  gffutils.example_filename('FBgn0031208.gtf'),
                  ':memory:',
                  infer_gene_extent=False)
def test_infer_gene_extent():
    """infer_gene_extent=False is accepted for now, with a single warning."""
    with warnings.catch_warnings(record=True) as w:
        gffutils.create_db(
            gffutils.example_filename('FBgn0031208.gtf'),
            ':memory:',
            infer_gene_extent=False)
        assert len(w) == 1
def test_false_function():
    """Smoke test: an id_spec callable returning False must not crash."""
    # Before commit ce4b7671f this raised "TypeError: object of type
    # 'function' has no len()".
    gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                       ':memory:',
                       keep_order=True,
                       id_spec=lambda x: False,
                       merge_strategy='create_unique')
def read_to_database(d):
    """Create (or overwrite) 'gff3.db' from example file `d` and return
    the resulting database object."""
    source = gffutils.example_filename(d)
    return gffutils.create_db(source, dbfn='gff3.db', force=True,
                              keep_order=True, merge_strategy='merge',
                              sort_attribute_values=True)
def setup(self):
    """
    Creates a new GFFDB or GTFDB (depending on self.__class__.featureclass)
    """
    self.featureclass = self.__class__.featureclass
    self.Feature = gffutils.Feature
    # NOTE(review): `extension` is assigned but never used downstream;
    # kept for parity with the original.
    if self.featureclass == 'GFF':
        extension = '.gff'
        self.fn = gffutils.example_filename('FBgn0031208.gff')
        self.dbfn = testdbfn_gff
    if self.featureclass == 'GTF':
        extension = '.gtf'
        self.fn = gffutils.example_filename('FBgn0031208.gtf')
        self.dbfn = testdbfn_gtf
    self.G = gffutils.FeatureDB(self.dbfn)
    self.conn = sqlite3.connect(self.dbfn)
    self.c = self.conn.cursor()
def test_verbose():
    """Smoke test: GFFDBCreator(verbose=True) runs without error."""
    # BUGFIX: the Python 2-only StringIO module does not exist on
    # Python 3; io.StringIO provides the same in-memory text stream.
    import io
    actual_stderr = sys.stderr
    sys.stderr = io.StringIO()
    try:
        gffdb = gffutils.db.GFFDBCreator(
            gffutils.example_filename('FBgn0031208.gff'), 'deleteme.db',
            verbose=True, force=True).create()
    finally:
        # BUGFIX: always restore stderr, even if creation raises.
        sys.stderr = actual_stderr
    os.unlink('deleteme.db')
def test_iterator_update():
    """update() accepts generators and an optional transform callable."""
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    assert len(list(db.all_features())) == 12
    orig_exon_coords = set(
        (i.start, i.stop) for i in db.features_of_type('exon'))

    # Reset all genes to have the same coords of start=1, stop=100.
    def gen():
        for f in db.features_of_type('gene'):
            f.start = 1
            f.stop = 100
            yield f

    db.update(gen(), merge_strategy='replace')
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop
    # Exons should have remained unchanged.
    assert orig_exon_coords == set(
        (i.start, i.stop) for i in db.features_of_type('exon'))

    # Same reset, this time through the `transform` hook.
    def _transform(f):
        f.start = 1
        f.stop = 100
        return f

    db = gffutils.create_db(db_fname, ':memory:')
    db.update(db.features_of_type('gene'), merge_strategy='replace',
              transform=_transform)
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop
    # Exons should have remained unchanged.
    assert orig_exon_coords == set(
        (i.start, i.stop) for i in db.features_of_type('exon'))
def test_issue_82():
    """A key-val separator inside an unquoted attribute value survives."""
    line = (
        'Spenn-ch12\tsgn_markers\tmatch\t2621812\t2622049\t.\t+\t.\t'
        'Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 '
        '|identity=99.58|escore=2e-126'
    )
    parsed = feature.feature_from_line(line)
    assert parsed.attributes['Note'] == [
        'marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126']
    # The same construct must also survive full db creation.
    gffutils.create_db(
        gffutils.example_filename('keyval_sep_in_attrs.gff'), ':memory:')
def parse_gff3(self):
    """Parse the Ensembl GFF3 into an in-memory db, then collect genes
    and transcripts from it."""
    print("-------- Ensembl data Parsing --------")
    print("\tParsing gff3 file...")
    print("\tcreating temporary database from file: " + self.gff)
    fn = gffutils.example_filename(self.gff)
    db = gffutils.create_db(fn, ":memory:", merge_strategy="create_unique")
    # A file-backed database could be used instead:
    # gffutils.create_db(fn, "DB.Ensembl_" + self.species[0] + ".db",
    #                    merge_strategy="create_unique")
    # db = gffutils.FeatureDB("DB.Ensembl_" + self.species[0] + ".db")
    self.collect_genes(db)
    self.collect_Transcripts(db)
def test_sequence():
    """feature.sequence() honors strand and returns a matching length."""
    fasta = gffutils.example_filename('dm6-chr2L.fa')
    f = feature.feature_from_line(
        'chr2L FlyBase gene 154 170 . + . ID=one;')
    # Forward strand, then the reverse complement after flipping strand.
    for strand, expected in (('+', 'aCGAGATGATAATATAT'),
                             ('-', 'ATATATTATCATCTCGt')):
        f.strand = strand
        seq = f.sequence(fasta)
        assert seq == expected
        assert len(seq) == len(f)
def test_pr_139():
    """Interfeature attributes must not leak from the first exon."""
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'), ':memory:')
    exons = list(db.features_of_type('exon'))
    inter = list(db.interfeatures(exons))
    first_name = exons[0].attributes['Name'][0]
    # Previously the first exon's attributes showed up in subsequent
    # merged interfeatures.
    for idx in (1, 2, 3):
        assert first_name not in inter[idx].attributes['Name']
def test_issue_105():
    """DataIterator should handle files outside the package data dir."""
    fn = gffutils.example_filename('FBgn0031208.gtf')
    newfn = os.path.join(os.path.expanduser('~'), '.gffutils.test')
    with open(newfn, 'w') as fout:
        fout.write(open(fn).read())
    # Simply consuming the iterator is the test; any failure raises.
    for _ in gffutils.iterators.DataIterator(newfn):
        pass
    os.unlink(newfn)
def test_disable_infer():
    """
    tests the new semantics for disabling gene/transcript inference
    """
    # Build a reference GTF db with full inference enabled.
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:')

    # Write one copy missing all transcripts and one missing all genes.
    import tempfile
    tempfile.tempdir = None
    no_transcripts = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    no_genes = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    for feat in db.all_features():
        if feat.featuretype != 'transcript':
            no_transcripts.write(str(feat) + '\n')
        if feat.featuretype != 'gene':
            no_genes.write(str(feat) + '\n')
    no_genes.close()
    no_transcripts.close()

    no_tx_db = gffutils.create_db(no_transcripts.name, ':memory:',
                                  disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes.name, ':memory:',
                                  disable_infer_genes=True)
    no_xx_db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:',
        disable_infer_genes=True, disable_infer_transcripts=True)

    # no transcripts but 3 genes
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3
    # no genes but 4 transcripts
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4
    # no genes or transcripts
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
def test_false_function():
    """A callable id_spec that always returns False is tolerated."""
    # smoke test: before commit ce4b7671f, this would raise "TypeError:
    # object of type 'function' has no len()"
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'),
        ':memory:',
        keep_order=True,
        id_spec=lambda feat: False,
        merge_strategy='create_unique',
    )
def test_sequence():
    """Sequence extraction is strand-aware and length-consistent."""
    fasta = gffutils.example_filename('dm6-chr2L.fa')
    feat = feature.feature_from_line(
        'chr2L FlyBase gene 154 170 . + . ID=one;')
    plus_seq = feat.sequence(fasta)
    assert plus_seq == 'aCGAGATGATAATATAT'
    assert len(plus_seq) == len(feat)
    # Flipping the strand yields the reverse complement.
    feat.strand = '-'
    minus_seq = feat.sequence(fasta)
    assert minus_seq == 'ATATATTATCATCTCGt'
    assert len(minus_seq) == len(feat)
def test_disable_infer():
    """
    tests the new semantics for disabling gene/transcript inference
    """
    # Reference db, fully inferred.
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:')

    # Derive two input files: one with transcripts stripped, one with
    # genes stripped.
    import tempfile
    tempfile.tempdir = None
    no_transcripts = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    no_genes = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    for feature in db.all_features():
        if feature.featuretype != 'transcript':
            no_transcripts.write(str(feature) + '\n')
        if feature.featuretype != 'gene':
            no_genes.write(str(feature) + '\n')
    no_genes.close()
    no_transcripts.close()

    no_tx_db = gffutils.create_db(no_transcripts.name, ':memory:',
                                  disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes.name, ':memory:',
                                  disable_infer_genes=True)
    no_xx_db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'),
        ':memory:',
        disable_infer_genes=True,
        disable_infer_transcripts=True,
    )

    # Disabling transcript inference: 0 transcripts, 3 genes.
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3
    # Disabling gene inference: 0 genes, 4 transcripts.
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4
    # Disabling both: neither genes nor transcripts.
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
def test_nonascii():
    """Smoke test: non-ASCII attribute values round-trip (prev. version
    returned Unicode)."""
    # BUGFIX: the create_db call had been commented out, leaving `db`
    # undefined — the test raised NameError before testing anything.
    db = gffutils.create_db(gffutils.example_filename('nonascii'),
                            ":memory:")
    for i in db.all_features():
        # this works in IPython, or using nosetests --with-doctest...
        try:
            # BUGFIX: Python 2 print statements converted to print();
            # six.text_type replaces the py2-only unicode() builtin,
            # matching the sibling fixed copy of this test.
            print(i)
        # ...but fails using plain nosetests or when using regular Python
        # interpreter
        except UnicodeEncodeError:
            print(six.text_type(i))
def test_issue_119():
    """Autoincrement counters survive and combine across update()."""
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'), ':memory:')
    db1 = gffutils.create_db(
        gffutils.example_filename('F3-unique-3.v2.gff'), ':memory:')
    db2 = db1.update(db)
    obs = sorted(db2._autoincrements.keys())
    assert obs == ['exon', 'read'], obs

    # More isolated test: merge two databases each built from the same
    # file containing a single feature with no ID.
    tmp = tempfile.NamedTemporaryFile(delete=False).name
    with open(tmp, 'w') as fout:
        fout.write('chr1\t.\tgene\t10\t15\t.\t+\t.\t\n')
    db3 = gffutils.create_db(tmp, ':memory:')
    assert db3._autoincrements == {'gene': 1}
    db4 = gffutils.create_db(tmp, ':memory:')
    assert db4._autoincrements == {'gene': 1}
    db5 = db3.update(db4)
    assert db5._autoincrements == {'gene': 2}
    assert db3._autoincrements == db5._autoincrements
def test_create_db_from_iter():
    """
    Test creation of FeatureDB from iterator.
    """
    print("Testing creation of DB from iterator")
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ":memory:", keep_order=True)

    def record_stream():
        for rec in db.all_features():
            yield rec

    new_db = gffutils.create_db(record_stream(), ":memory:",
                                keep_order=True)
    print(list(new_db.all_features()))
    genes = new_db.all_features(featuretype="gene")
    assert len(list(genes)) != 0, "Could not load genes from GFF."
def test_nonascii():
    """Smoke test: non-ASCII attribute values should print cleanly."""
    db = gffutils.create_db(gffutils.example_filename('nonascii'),
                            ":memory:", keep_order=True)
    for feat in db.all_features():
        # Printing works in IPython / nosetests --with-doctest...
        try:
            print(feat)
        # ...but can fail under plain nosetests or a regular interpreter,
        # in which case fall back to an explicit text conversion.
        except UnicodeEncodeError:
            print(six.text_type(feat))
def test_roundtrip():
    """
    Feature -> SeqFeature -> Feature should be invariant.
    """
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    feature = db['ENSMUSG00000033845']
    feature.keep_order = True
    dialect = feature.dialect
    s = bp.to_seqfeature(feature)
    # BioPython locations are 0-based half-open; GFF is 1-based inclusive.
    assert s.location.start.position == feature.start - 1
    assert s.location.end.position == feature.stop
    restored = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
    assert feature == restored
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.
    """
    print("Testing creation of DB from URL iterator")
    # initially run SimpleHTTPServer at port 0 and os will take first
    # available
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)
    # Serving test/data folder
    served_folder = gffutils.example_filename('')
    savedir = os.getcwd()
    os.chdir(served_folder)
    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    # BUGFIX: the attribute is spelled "daemon"; the original "deamon"
    # typo just set an unused attribute, so the thread was never
    # daemonized.
    server_thread.daemon = True
    server_thread.start()
    try:
        # Plain-text GFF over HTTP.
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:",
                                    keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert len(list(gene_feats)) != 0, "Could not load genes from GFF."

        # Same again with the gzip-compressed copy.
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3.gz'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:",
                                    keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert len(list(gene_feats)) != 0, "Could not load genes from GFF."
    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
        os.chdir(savedir)
def test_random_chr():
    """
    Test on GFF files with random chromosome events.
    """
    gff_fname = gffutils.example_filename("random-chr.gff")
    db = helpers.get_gff_db(gff_fname)
    # Test that we can get children of only a selected type
    gene_id = ("chr1_random:165882:165969:-@chr1_random:137473:137600:-@"
               "chr1_random:97006:97527:-")
    # BUGFIX: materialize the children first. The original iterated the
    # generator and then re-iterated it inside the assertion message,
    # which would always have produced an empty string.
    mRNAs = list(db.children(gene_id, featuretype="mRNA"))
    for mRNA_entry in mRNAs:
        assert mRNA_entry.featuretype == "mRNA", \
            "Not all entries are of type mRNA! %s" \
            % (",".join([entry.featuretype for entry in mRNAs]))
    print("Parsed random chromosome successfully.")
def test_region():
    """region() honors coordinates, strand filters, and unknown seqids."""
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ":memory:", keep_order=True)
    cases = [
        ("chr1:4000000-5000000", 12),    # all features in range
        ("chr1:4000000-5000000:-", 12),  # everything here is minus-strand
        ("chr1:4000000-5000000:+", 0),
        ("chr1:4000000-5000000:.", 0),
        ("nowhere:1-100", 0),            # unknown seqid
    ]
    for query, count in cases:
        assert len(list(db.region(query))) == count, query
def test_sanitize_gff():
    """
    Test sanitization of GFF. Should be merged with GFF cleaning I believe
    unless they are intended to have different functionalities.
    """
    fn = gffutils.example_filename("unsanitized.gff")
    db = helpers.get_gff_db(fn)
    sanitized = helpers.sanitize_gff_db(db)
    # After sanitization, every feature must satisfy start <= stop.
    for rec in sanitized.all_features():
        assert rec.start <= rec.stop, "Sanitization failed."
    print("Sanitized GFF successfully.")
def test_for_analyze():
    """FeatureDB should warn when the sqlite stats table is missing."""
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), 'deleteme',
        force=True)
    assert db._analyzed()
    # Dropping the stats table makes the db look un-analyzed.
    db.execute('DROP TABLE sqlite_stat1')
    assert not db._analyzed()
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        db2 = gffutils.FeatureDB('deleteme')
        assert len(caught) == 1
        assert "analyze" in str(caught[-1].message)
    # Re-analyzing restores the stats.
    db.analyze()
    assert db._analyzed()
    os.unlink('deleteme')
def test_gffwriter():
    """
    Test GFFWriter.
    """
    print("Testing GFF writer..")
    fn = gffutils.example_filename("unsanitized.gff")
    # Make a copy of it as temporary named file
    temp_f = tempfile.NamedTemporaryFile(delete=False)
    temp_fname_source = temp_f.name
    shutil.copy(fn, temp_fname_source)
    # Now write file in place
    # BUGFIX: use context managers so the read handles are closed
    # (the original leaked every file object it opened for reading).
    with open(temp_fname_source, "r") as src:
        source_first_line = src.readline().strip()
    assert not source_first_line.startswith("#GFF3"), \
        "unsanitized.gff should not have a gffutils-style header."
    db_in = gffutils.create_db(fn, ":memory:", keep_order=True)
    # Fetch first record
    rec = six.next(db_in.all_features())
    ##
    ## Write GFF file in-place test
    ##
    print("Testing in-place writing")
    gff_out = gffwriter.GFFWriter(temp_fname_source,
                                  in_place=True,
                                  with_header=True)
    gff_out.write_rec(rec)
    gff_out.close()
    # Ensure that the file was written with header
    with open(temp_fname_source, "r") as rewritten:
        new_header = rewritten.readline().strip()
    assert new_header.startswith("#GFF3"), \
        "GFFWriter serialized files should have a #GFF3 header."
    print("  - Wrote GFF file in-place successfully.")
    ##
    ## Write GFF file to new file test
    ##
    print("Testing writing to new file")
    new_file = tempfile.NamedTemporaryFile(delete=False)
    gff_out = gffwriter.GFFWriter(new_file.name)
    gff_out.write_rec(rec)
    gff_out.close()
    with open(new_file.name, "r") as fresh:
        new_line = fresh.readline().strip()
    assert new_line.startswith("#GFF3"), \
        "GFFWriter could not write to a new GFF file."
    print("  - Wrote to new file successfully.")
def test_attributes_modify():
    """
    Test that attributes can be modified in a GFF record.
    """
    # Test that attributes can be modified
    gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                       testdbfn_gff, verbose=False, force=True)
    db = gffutils.FeatureDB(testdbfn_gff)
    gene_id = "FBgn0031208"
    gene_childs = list(db.children(gene_id))
    # BUGFIX: Python 2 print statements replaced with print() calls
    # (SyntaxError on Python 3, which the rest of this file targets).
    print("old attributes: ")
    print(gene_childs[0].attributes)
    assert str(gene_childs[0].attributes) == (
        'ID=FBtr0300689;Name=CG11023-RB;Parent=FBgn0031208;'
        'Dbxref=FlyBase_Annotation_IDs:CG11023-RB;'
        'score_text=Strongly Supported;score=11')
    gene_childs[0].attributes["ID"] = "Modified"
    print("new attributes: ")
    print(gene_childs[0].attributes)
    # NOTE: the duplicated trailing ID reflects current gffutils behavior
    # when assigning to an existing attribute key.
    assert str(gene_childs[0].attributes) == (
        'ID=Modified;Name=CG11023-RB;Parent=FBgn0031208;'
        'Dbxref=FlyBase_Annotation_IDs:CG11023-RB;'
        'score_text=Strongly Supported;score=11;ID=Modified')
def test_add_relation():
    """add_relation() wires up a parent/child link via child_func."""
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:', keep_order=True)
    # No children exist for this feature yet.
    assert len(list(db.children('FBgn0031208:3'))) == 0

    def annotate(parent, child):
        # Record the new parent both in Parent and in a custom attribute.
        child['Parent'] = child['Parent'] + [parent.id]
        child['exon_parent'] = [parent.id]
        return child

    db.add_relation('FBgn0031208:3', 'CDS_FBgn0031208:1_737', 1,
                    child_func=annotate)
    children = list(db.children('FBgn0031208:3'))
    assert len(children) == 1, children
    child = children[0]
    assert 'FBgn0031208:3' in child['Parent']
    assert child['exon_parent'] == ['FBgn0031208:3']
def test_issue_79():
    """First line of the db dump must match the first input GTF line."""
    gtf = gffutils.example_filename('keep-order-test.gtf')
    db = gffutils.create_db(gtf, 'tmp.db',
                            disable_infer_genes=False,
                            disable_infer_transcripts=False,
                            id_spec={"gene": "gene_id",
                                     "transcript": "transcript_id"},
                            merge_strategy="create_unique",
                            keep_order=True,
                            force=True)
    expected_text = open(gtf).read()
    observed_text = '\n'.join(str(f) for f in db.all_features())
    expected_first = expected_text.splitlines(True)[0].strip()
    observed_first = observed_text.splitlines(True)[0].strip()
    print('EXP')
    print(expected_first)
    print('OBS')
    print(observed_first)
    print('DIFF')
    print(''.join(difflib.ndiff([expected_first], [observed_first])))
    assert observed_first == expected_first
def test_tempfiles():
    """_keep_tempfiles controls whether (and with which suffix) gffutils
    leaves its intermediate files behind; also exercises parallel db
    creation across processes."""
    # specifiy a writeable temp dir for testing
    tempdir = '/tmp/gffutils-test'

    def clean_tempdir():
        tempfile.tempdir = tempdir
        if os.path.exists(tempdir):
            shutil.rmtree(tempdir)
        os.makedirs(tempdir)

    clean_tempdir()

    # default keep_tempfiles=False should give us nothing.
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:')
    assert len(os.listdir(tempdir)) == 0

    # adding keep_tempfiles=True should give us 1 tempfile for gtf...
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:',
        _keep_tempfiles=True)
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.gffutils')

    # ...and another one for gff. This time, make sure the suffix
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'), ':memory:',
        _keep_tempfiles=True)
    filelist = os.listdir(tempdir)
    assert len(filelist) == 2, filelist
    for i in filelist:
        assert i.endswith('.gffutils')

    # OK, now delete what we have so far...
    clean_tempdir()

    # Make sure that works for custom suffixes
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:',
        _keep_tempfiles='.GTFtmp')
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.GTFtmp')

    clean_tempdir()
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:',
        _keep_tempfiles='.GFFtmp')
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.GFFtmp')

    # Test n parallel instances of gffutils across PROCESSES processes.
    #
    # Note that travis-ci doesn't like it when you use multiple cores, so
    # the .travis.yml file sets this to 1. This also means that
    # 1) `n` shouldn't be too large because travis-ci will run one at
    #    a time, but more importantly,
    # 2) this will only truly test parallel processes on a local machine
    #    with multiple cpus.
    clean_tempdir()

    # .travis.yml sets the PROCESSES env var; otherwise use all available.
    PROCESSES = int(os.environ.get("PROCESSES",
                                   multiprocessing.cpu_count()))
    pool = multiprocessing.Pool(PROCESSES)
    n = 100
    res = pool.map(make_db, range(n))
    assert sorted(list(res)) == list(range(n))
    filelist = os.listdir(tempdir)
    assert len(filelist) == n, len(filelist)

    expected = dedent("""\
        FBtr0300689\tchr2L\t7529\t9484\t+\ttranscript\t4681\t{"transcript_id":["FBtr0300689"],"gene_id":["FBgn0031208"]}
        FBgn0031208\tchr2L\t7529\t9484\t+\tgene\t4681\t{"gene_id":["FBgn0031208"]}
        FBtr0300690\tchr2L\t7529\t9484\t+\ttranscript\t4681\t{"transcript_id":["FBtr0300690"],"gene_id":["FBgn0031208"]}
        transcript_Fk_gene_1\tchr2L\t10000\t11000\t-\ttranscript\t4681\t{"transcript_id":["transcript_Fk_gene_1"],"gene_id":["Fk_gene_1"]}
        Fk_gene_1\tchr2L\t10000\t11000\t-\tgene\t4681\t{"gene_id":["Fk_gene_1"]}
        transcript_Fk_gene_2\tchr2L\t11500\t12500\t-\ttranscript\t4681\t{"transcript_id":["transcript_Fk_gene_2"],"gene_id":["Fk_gene_2"]}
        Fk_gene_2\tchr2L\t11500\t12500\t-\tgene\t4681\t{"gene_id":["Fk_gene_2"]}
        """)

    def matches_expected(fn):
        """
        Python 3 has unpredictable dictionary ordering. This function
        checks the *semantic* similarity of lines by parsing the
        attributes into a dictonary.
        """
        exp_features = expected.splitlines(True)
        new_features = list(open(fn))
        assert len(exp_features) == len(new_features)
        for expline, newline in zip(exp_features, new_features):
            exp_toks = expline.split()
            new_toks = newline.split()
            assert exp_toks[:-1] == new_toks[:-1]
            assert json.loads(exp_toks[-1]) == json.loads(new_toks[-1])

    # make sure that each of the `n` files matches the expected output.
    for fn in filelist:
        fn = os.path.join(tempdir, fn)
        try:
            matches_expected(fn)
        except AssertionError:
            # BUGFIX: `this` was undefined here (NameError hid the real
            # failure); diff against the actual file contents instead.
            observed = open(fn).read()
            print(''.join(difflib.ndiff(expected.splitlines(True),
                                        observed.splitlines(True))))
            raise
    clean_tempdir()
def test_inspect():
    """inspect() must agree whether fed a filename, an iterator, or a db."""
    fn = gffutils.example_filename('FBgn0031208.gff')
    file_results = inspect.inspect(fn, verbose=False)
    # db inspection agrees with file inspection because the db was created
    # from the same file.
    db_results = inspect.inspect(
        gffutils.create_db(fn, ':memory:'), verbose=False)
    expected = {
        'featuretype': {
            'intron': 3,
            'five_prime_UTR': 1,
            'exon': 6,
            'mRNA': 4,
            'CDS': 5,
            'pcr_product': 1,
            'three_prime_UTR': 2,
            'protein': 2,
            'gene': 3,
        },
        'feature_count': 27,
        'chrom': {'chr2L': 27},
        'attribute_keys': {
            u'': 3,
            'Dbxref': 6,
            'Name': 19,
            'Parent': 20,
            ' Parent': 1,
            'score_text': 2,
            'gbunit': 1,
            'derived_computed_cyto': 1,
            'Derives_from': 2,
            'derived_molecular_weight': 2,
            'score': 2,
            'ID': 25,
            'derived_isoelectric_point': 2,
            'Ontology_term': 1,
        },
    }
    assert file_results == db_results == expected

    # Restrict the fields inspected and cap at the first 10 features; a raw
    # file iterator should agree as well.
    kwargs = dict(
        look_for=['chrom', 'strand', 'attribute_keys', 'featuretype'],
        verbose=False,
        limit=10,
    )
    file_results = inspect.inspect(fn, **kwargs)
    iter_results = inspect.inspect(
        iter(iterators._FileIterator(fn)), **kwargs)
    db_results = inspect.inspect(
        gffutils.create_db(fn, ':memory:'), **kwargs)
    expected = {
        'attribute_keys': {
            u'Name': 9,
            u'Parent': 9,
            u'score_text': 2,
            u'gbunit': 1,
            u'derived_computed_cyto': 1,
            u'score': 2,
            u'Dbxref': 3,
            u'ID': 8,
            u'Ontology_term': 1,
        },
        'feature_count': 10,
        'chrom': {u'chr2L': 10},
        'strand': {u'+': 10},
        'featuretype': {
            u'five_prime_UTR': 1,
            u'exon': 3,
            u'mRNA': 2,
            u'CDS': 1,
            'intron': 2,
            u'gene': 1,
        },
    }
    assert file_results == db_results == iter_results == expected
def test_update():
    """Exercise FeatureDB.update(): adding new features, the 'merge' and
    'replace' strategies, updating while iterating, and GTF dialects."""
    # check both in-memory and file-based dbs
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:', verbose=False,
        keep_order=True, force=True)
    orig_num_features = len(list(db.all_features()))
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)
    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    assert str(x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1", str(x)
    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using
    # merge_strategy="merge", which appends items to attributes
    # ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()
    assert str(x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1,2", str(x)
    # still should have the same number of features as before (still 2)
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Merging while iterating. e.g., if you're updating children with gene
    # IDs.
    db = create.create_db(example_filename('FBgn0031208.gff'), ':memory:',
                          verbose=False, force=True, keep_order=True)
    for gene in db.features_of_type('gene'):
        for child in list(db.children(gene)):
            # important: the FBgn0031208.gff file was designed to have some
            # funky features: there are two exons without ID attributes.
            # These are assigned to ids "exon_1" and "exon_2". Upon update,
            # with still no ID, we then have two new features "exon_3" and
            # "exon_4". To prevent this issue, we ensure that the ID
            # attribute exists...
            child.attributes['gene_id'] = [gene.id]
            if 'ID' not in child.attributes:
                child.attributes['ID'] = [child.id]
            db.update([child], merge_strategy='replace')
        print("\n\nafter\n\n")
        for child in db.children(gene):
            print(child.id)
            assert child.attributes['gene_id'] == ['FBgn0031208'], (
                child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))
    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)
    f.keep_order = True
    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(x[0]) == "chr2L . testing 1 10 . + . ID=testing_feature;n=3", str(x[0])
    # still should have the same number of features as before (still 2)
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Same thing, but GTF instead of GFF.
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:', verbose=False,
        force=True, keep_order=True)
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    # note the trailing semicolon. That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L . testing 1 10 . + . gene_id "fake"; n "1";', str(x)