Beispiel #1
0
    def test_summarise_genes(self):
        # Set up some test data
        gene_data = """##gff-version 3
#description: test data
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene1
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene2
"""
        transcript_data = """##gff-version 3
#description: test data
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene1
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene1
"""
        exon_data = """##gff-version 3
#description: test data
chr1\tme\texon\t1\t400\t.\t+\t.\tID=exon1;Parent=transcript1
chr1\tme\texon\t500\t1000\t.\t+\t.\tID=exon2;Parent=transcript1
"""
        genes = gff.parse_gff3_to_dataframe(io.StringIO(gene_data))
        transcripts = gff.parse_gff3_to_dataframe(io.StringIO(transcript_data))
        exons = gff.parse_gff3_to_dataframe(io.StringIO(exon_data))

        summary = gff.summarise_genes(genes, transcripts, exons)
        # Or use the python version:
        # summary = gff.summarise_genes_python_version( genes, transcripts, exons )
        self.assertEqual(summary['ID'][0], 'gene1')
        self.assertEqual(summary['ID'][1], 'gene2')
        self.assertEqual(summary['number_of_transcripts'][0], 2)
        self.assertEqual(summary['number_of_transcripts'][1], 0)
        self.assertEqual(summary['average_number_of_exons'][0], 1)
        import math
        self.assertTrue(math.isnan(summary['average_number_of_exons'][1]))
Beispiel #2
0
def process(args):
    """Load a GFF3 file and store its records and sequence metadata in SQLite.

    ``args.input`` is read twice: once with ``gff.parse_gff3_to_dataframe``
    for the feature records and once with
    ``gff.parse_sequences_from_gff_metadata`` for the sequence metadata.
    Both results get an ``analysis`` column (inserted first, holding
    ``args.analysis``) and are written to the hard-coded tables ``gff_data``
    and ``sequences`` of the SQLite database at ``args.output``.

    Args:
        args: Object exposing ``input``, ``output``, ``analysis`` and
            ``overwrite`` attributes (presumably an argparse namespace --
            confirm against the caller). A truthy ``overwrite`` replaces
            existing tables; otherwise rows are appended.
    """
    print("++ Loading genes data from %s...\n" % args.input)
    # Use a context manager so the input file handle is closed promptly.
    with open(args.input) as gff_file:
        data = gff.parse_gff3_to_dataframe(gff_file)
    print("++ ok, %d records loaded, they look like:\n" % data.shape[0])
    print(data)

    print("++ Loading sequence lengths from %s...\n" % args.input)
    with open(args.input) as gff_file:
        sequences = gff.parse_sequences_from_gff_metadata(gff_file)
    print("++ ok, %d records loaded, they look like:\n" % sequences.shape[0])
    print(sequences)

    print("++ Writing records to %s...\n" % args.output)
    if_exists = 'replace' if args.overwrite else 'append'
    db = sqlite3.connect(args.output)
    try:
        # In this version the `gff_data` and `sequences` table names are
        # hard-coded.
        data.insert(0, 'analysis', args.analysis)
        data.to_sql('gff_data', db, index=False, if_exists=if_exists)
        sequences.insert(0, 'analysis', args.analysis)
        sequences.to_sql("sequences", db, index=False, if_exists=if_exists)
    finally:
        # Always release the database connection, even if a write fails.
        db.close()

    print("++ Success.\n")
Beispiel #3
0
    def test_count_exons_per_transcript(self):
        # Set up some test data
        transcript_data = """##gff-version 3
#description: test data
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene1
chr1\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene1
"""
        exon_data = """##gff-version 3
#description: test data
chr1\tme\texon\t1\t400\t.\t+\t.\tID=exon1;Parent=transcript1
chr1\tme\texon\t500\t1000\t.\t+\t.\tID=exon2;Parent=transcript1
"""
        transcripts = gff.parse_gff3_to_dataframe(io.StringIO(transcript_data))
        exons = gff.parse_gff3_to_dataframe(io.StringIO(exon_data))
        summary = gff.count_exons_per_transcript(transcripts, exons)
        self.assertEqual(summary['ID'][0], 'transcript1')
        self.assertEqual(summary['ID'][1], 'transcript2')
        self.assertEqual(summary['Parent'][0], 'gene1')
        self.assertEqual(summary['Parent'][1], 'gene1')
        self.assertEqual(summary['number_of_exons'][0], 2)
        self.assertEqual(summary['number_of_exons'][1], 0)
Beispiel #4
0
    def test_parse_gff3_to_dataframe(self):
        test_data = """##gff-version 3
#description: test data
chr1\tme\tgene\t1\t1000\t.\t+\t.\tID=gene1;other_data=stuff
chr1\tme\texon\t1\t1000\t.\t+\t.\tID=gene1.1;Parent=gene1
chr10\tme\tgene\t1\t1000\t.\t+\t.\tID=gene2;gene_id=my_test_gene
chr10\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript1;Parent=gene2
chr10\tme\texon\t1\t1000\t.\t+\t.\tID=my_test_exon;Parent=transcript1
chr10\tme\ttranscript\t1\t1000\t.\t+\t.\tID=transcript2;Parent=gene2
        """

        X = gff.parse_gff3_to_dataframe(io.StringIO(test_data))

        self.assertEqual(X.loc[0]['ID'], 'gene1')
        self.assertEqual(X.loc[1]['ID'], 'gene1.1')
        self.assertEqual(X.loc[1]['Parent'], 'gene1')
        self.assertEqual(X.loc[2]['start'], 1)
        self.assertEqual(X.loc[2]['end'], 1000)
        self.assertEqual(X.loc[2]['end'], 1000)
def process(args):
    """Load a GFF3 file and store its records in one SQLite table.

    Parses ``args.input`` with ``gff.parse_gff3_to_dataframe``, inserts an
    ``analysis`` column (first, holding ``args.analysis``) and writes the
    result to table ``args.table`` of the SQLite database at ``args.output``.

    Args:
        args: Object exposing ``input``, ``output``, ``table``, ``analysis``
            and ``overwrite`` attributes (presumably an argparse namespace --
            confirm against the caller). A truthy ``overwrite`` replaces an
            existing table; otherwise rows are appended.
    """
    print("++ Loading genes data from %s...\n" % args.input)
    # Use a context manager so the input file handle is closed promptly.
    with open(args.input) as gff_file:
        data = gff.parse_gff3_to_dataframe(gff_file)
    print("++ ok, %d records loaded, they look like:\n" % data.shape[0])
    print(data)

    print("++ Writing records to %s:%s...\n" % (args.output, args.table))
    db = sqlite3.connect(args.output)
    try:
        # Pandas has a handy to_sql method for this.
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
        # First we add the 'analysis' column
        data.insert(0, 'analysis', args.analysis)
        data.to_sql(args.table,
                    db,
                    index=False,
                    if_exists='replace' if args.overwrite else 'append')
        # NOTE: indexing the ID field is a good idea, but it is deliberately
        # not done here: when loading multiple datasets it is faster to index
        # once afterwards by running SQL of this form on the db:
        #   CREATE INDEX IF NOT EXISTS `<table>_ID_index` ON `<table>` ( ID )
    finally:
        # Always release the database connection, even if the write fails.
        db.close()
    print("++ Success.\n")