def test_summarise_genes(self):
    """Check gff.summarise_genes() counts transcripts per gene and averages
    exons per transcript, yielding NaN for genes with no transcripts."""
    import math

    # Two genes; only gene1 has transcripts, and only transcript1 has exons.
    gene_data = """##gff-version 3
#description: test data
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene1
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene2
"""
    transcript_data = """##gff-version 3
#description: test data
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene1
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene1
"""
    exon_data = """##gff-version 3
#description: test data
chr1\tme\texon\t1\t400\t.\t+\t.\tID=exon1;Parent=transcript1
chr1\tme\texon\t500\t1000\t.\t+\t.\tID=exon2;Parent=transcript1
"""

    genes = gff.parse_gff3_to_dataframe(io.StringIO(gene_data))
    transcripts = gff.parse_gff3_to_dataframe(io.StringIO(transcript_data))
    exons = gff.parse_gff3_to_dataframe(io.StringIO(exon_data))

    result = gff.summarise_genes(genes, transcripts, exons)
    # A pure-python implementation is also available:
    # result = gff.summarise_genes_python_version(genes, transcripts, exons)

    self.assertEqual('gene1', result['ID'][0])
    self.assertEqual('gene2', result['ID'][1])
    # gene1 has two transcripts; gene2 has none.
    self.assertEqual(2, result['number_of_transcripts'][0])
    self.assertEqual(0, result['number_of_transcripts'][1])
    # gene1: 2 exons over 2 transcripts -> average of 1.
    self.assertEqual(1, result['average_number_of_exons'][0])
    # gene2 has no transcripts, so the average is undefined (NaN).
    self.assertTrue(math.isnan(result['average_number_of_exons'][1]))
def process(args):
    """Load GFF records and sequence metadata from ``args.input`` and write
    them to the sqlite3 database at ``args.output``.

    Records go to the (hard-coded) `gff_data` table and sequence lengths to
    the `sequences` table.  Both tables gain a leading 'analysis' column set
    to ``args.analysis``.  If ``args.overwrite`` is true existing tables are
    replaced, otherwise rows are appended.
    """
    print("++ Loading genes data from %s...\n" % args.input)
    # Use `with` so the file handle is closed even if parsing fails
    # (the original left both handles open).
    with open(args.input) as input_file:
        data = gff.parse_gff3_to_dataframe(input_file)
    print("++ ok, %d records loaded, they look like:\n" % data.shape[0])
    print(data)
    print("++ Loading sequence lengths from %s...\n" % args.input)
    with open(args.input) as input_file:
        sequences = gff.parse_sequences_from_gff_metadata(input_file)
    print("++ ok, %d records loaded, they look like:\n" % sequences.shape[0])
    print(sequences)
    print("++ Writing records to %s...\n" % args.output)
    db = sqlite3.connect(args.output)
    try:
        # In this version I have hard-coded `gff_data` and `sequences` table names.
        data.insert(0, 'analysis', args.analysis)
        data.to_sql('gff_data', db, index=False,
                    if_exists='replace' if args.overwrite else 'append')
        sequences.insert(0, 'analysis', args.analysis)
        sequences.to_sql("sequences", db, index=False,
                         if_exists='replace' if args.overwrite else 'append')
    finally:
        # Always release the connection, even if a write fails.
        db.close()
    print("++ Success.\n")
def test_count_exons_per_transcript(self):
    """Check gff.count_exons_per_transcript() tallies exons for each
    transcript, reporting 0 for transcripts with no exons."""
    # Two transcripts of gene1; only transcript1 has exons.
    transcript_data = """##gff-version 3
#description: test data
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene1
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene1
"""
    exon_data = """##gff-version 3
#description: test data
chr1\tme\texon\t1\t400\t.\t+\t.\tID=exon1;Parent=transcript1
chr1\tme\texon\t500\t1000\t.\t+\t.\tID=exon2;Parent=transcript1
"""

    transcripts = gff.parse_gff3_to_dataframe(io.StringIO(transcript_data))
    exons = gff.parse_gff3_to_dataframe(io.StringIO(exon_data))
    counts = gff.count_exons_per_transcript(transcripts, exons)

    # Expected per-row (ID, Parent, number_of_exons) values.
    expected = [
        ('transcript1', 'gene1', 2),
        ('transcript2', 'gene1', 0),
    ]
    for row, (transcript_id, parent, n_exons) in enumerate(expected):
        self.assertEqual(counts['ID'][row], transcript_id)
        self.assertEqual(counts['Parent'][row], parent)
        self.assertEqual(counts['number_of_exons'][row], n_exons)
def test_parse_gff3_to_dataframe(self):
    """Check gff.parse_gff3_to_dataframe() extracts the fixed columns
    (start/end) and the ID/Parent attributes from GFF3 records."""
    test_data = """##gff-version 3
#description: test data
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene1;other_data=stuff
chr1\tme\texon\t1\t1000\t.\t+\t.\tID=gene1.1;Parent=gene1
chr10\tme\tgene\t1\t1000\t.\t+\t.\tID=gene2;gene_id=my_test_gene
chr10\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene2
chr10\tme\texon\t1\t1000\t.\t+\t.\tID=my_test_exon;Parent=transcript1
chr10\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene2
"""
    X = gff.parse_gff3_to_dataframe(io.StringIO(test_data))
    # ID/Parent are pulled out of the attributes column.
    self.assertEqual(X.loc[0]['ID'], 'gene1')
    self.assertEqual(X.loc[1]['ID'], 'gene1.1')
    self.assertEqual(X.loc[1]['Parent'], 'gene1')
    # start/end come through as integers.
    self.assertEqual(X.loc[2]['start'], 1)
    self.assertEqual(X.loc[2]['end'], 1000)
    # Fixed: the original asserted `end` twice; check the row's ID instead.
    self.assertEqual(X.loc[2]['ID'], 'gene2')
def process(args):
    """Load GFF records from ``args.input`` and write them to table
    ``args.table`` of the sqlite3 database at ``args.output``.

    A leading 'analysis' column set to ``args.analysis`` is added.  If
    ``args.overwrite`` is true the table is replaced, otherwise rows are
    appended.
    """
    print("++ Loading genes data from %s...\n" % args.input)
    # Use `with` so the file handle is closed even if parsing fails
    # (the original left it open).
    with open(args.input) as input_file:
        data = gff.parse_gff3_to_dataframe(input_file)
    print("++ ok, %d records loaded, they look like:\n" % data.shape[0])
    print(data)
    print("++ Writing records to %s:%s...\n" % (args.output, args.table))
    db = sqlite3.connect(args.output)
    try:
        # Pandas has a handy to_sql method for this.
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
        # First we add the 'analysis' column
        data.insert(0, 'analysis', args.analysis)
        data.to_sql(args.table, db, index=False,
                    if_exists='replace' if args.overwrite else 'append')
        #print( "++ Indexing ID field...\n" )
        # Indexing is a good idea. But if we are doing multiple datasets
        # it is faster to index them after, by running this SQL on the db:
        #db.execute( "CREATE INDEX IF NOT EXISTS `%s_ID_index` ON `%s` ( ID )" % ( args.table, args.table ))
        #db.commit()
    finally:
        # Always release the connection, even if the write fails.
        db.close()
    print("++ Success.\n")