Exemple #1
0
    def test_novelty_merging(self):
        """Exercise how the t_df 'novelty' column is combined across datasets.

        Four situations are covered in order: only the first dataset is
        labelled, a labelled graph is merged in with merge_dfs, both graphs
        are labelled, and neither dataset is labelled.
        """
        a_gtf = 'input_files/annot.gtf'
        b_gtf = 'input_files/annot_2.gtf'

        # only dataset 'a' carries novelty categories, and dataset 'b'
        # (added without any) contains transcripts that 'a' lacks; the
        # control below expects those transcripts to be 'Undefined'
        print('Testing if novelty merging works...')
        sg = swan.SwanGraph()
        sg.add_dataset('a', a_gtf, include_isms=True)
        sg.t_df['novelty'] = ['Known', 'NIC', 'ISM']
        sg.add_dataset('b', b_gtf, include_isms=True)
        test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
        control = [('ENST01', 'Known'), ('ENST02', 'Undefined'),
                   ('ENST08', 'Undefined'), ('ENST03', 'NIC'),
                   ('ENST04', 'Undefined'), ('ENST07', 'ISM')]
        check_pairs(control, test)

        # sg still holds the 'Undefined' labels from the step above;
        # merging a graph that supplies labels for those transcripts
        # is expected to overwrite them (see the control below)
        print('Testing merging with novelty=undefined given a new label')
        sg_b = swan.SwanGraph()
        sg_b.add_dataset('b_2', b_gtf, include_isms=True)
        sg_b.t_df['novelty'] = ['ISM', 'Antisense', 'Intergenic', 'Genomic']

        # debugging output: show both transcript tables before the merge
        print(sg.t_df)
        print(sg_b.t_df)
        sg.merge_dfs(sg_b, 'b_2')
        control = [('ENST01', 'Ambiguous'), ('ENST02', 'Antisense'),
                   ('ENST08', 'Genomic'), ('ENST03', 'NIC'),
                   ('ENST04', 'Intergenic'), ('ENST07', 'ISM')]
        test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
        check_pairs(control, test)

        # both datasets have novelty categorizations; the same GTF is
        # loaded into two graphs that are labelled differently
        # (np.nan presumably stands for "no label" -- TODO confirm)
        ab_gtf = 'input_files/annot_3.gtf'
        sg_a = swan.SwanGraph()
        sg_a.add_dataset('a', ab_gtf, include_isms=True)
        sg_a.t_df['novelty'] = ['Known', 'Known', 'ISM', np.nan, 'NIC', 'NNC']
        sg_b = swan.SwanGraph()
        sg_b.add_dataset('b', ab_gtf, include_isms=True)
        sg_b.t_df['novelty'] = ['Known', 'ISM', np.nan, 'NIC', 'NNC', 'NNC']
        sg_a.merge_dfs(sg_b, 'b')
        test = sg_a.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
        control = [('ENST01', 'Known'), ('ENST02', 'Ambiguous'),
                   ('ENST08', 'NNC'), ('ENST03', 'ISM'), ('ENST04', 'NIC'),
                   ('ENST07', 'Ambiguous')]
        check_pairs(control, test)

        # neither dataset has novelty types, so no 'novelty' column
        # should have been created at all
        sg = swan.SwanGraph()
        sg.add_dataset('a', a_gtf, include_isms=True)
        sg.add_dataset('b', b_gtf, include_isms=True)
        assert 'novelty' not in sg.t_df.columns
Exemple #2
0
    def test_check_abundances(self):
        """Check that check_abundances rejects datasets without count data.

        The graph is given three dataset names but count columns for only
        'a' and 'b', so both 'c' (listed, no counts) and 'd' (not listed)
        must raise, while 'a' and 'b' must pass and return None.
        """
        g = swan.SwanGraph()
        g.datasets = ['a', 'b', 'c']
        g.counts = ['a_counts', 'b_counts']

        # 1: test for abundance that isn't there
        # also makes sure it works with a plain string input (i.e. not a list)
        with pytest.raises(Exception) as excinfo:
            g.check_abundances('d')
        assert 'Abundance for dataset d' in str(excinfo.value)

        # 2: test for an abundance from a dataset that is there but
        # does not have abundance info
        # also makes sure it works with a single-element list
        with pytest.raises(Exception) as excinfo:
            g.check_abundances(['c'])
        assert 'Abundance for dataset c' in str(excinfo.value)

        # 3: test for multiple datasets that are in the graph
        result = g.check_abundances(['a', 'b'])
        # identity check: None is a singleton, so `is` is the right test
        assert result is None

        # 4: test for a dataset that's not there in a list
        # of datasets that are there
        with pytest.raises(Exception) as excinfo:
            g.check_abundances(['a', 'b', 'd'])
        assert 'Abundance for dataset d' in str(excinfo.value)
Exemple #3
0
    def test_no_gene_name_gtf(self):
        """Assert gname equals gid for every transcript loaded from Canx.gtf.

        Judging by the test name, the GTF presumably lacks gene names.
        """
        graph = swan.SwanGraph()
        graph.add_dataset('test', 'input_files/Canx.gtf')

        transcripts = graph.t_df
        assert transcripts.gname.tolist() == transcripts.gid.tolist()
Exemple #4
0
    def test_no_gene_name_db(self):
        """Assert gname equals gid for every transcript loaded from the db.

        Judging by the file name, the database presumably lacks gene names.
        """
        graph = swan.SwanGraph()
        graph.add_dataset('test', 'input_files/chr11_and_Tcf3_no_gname.db')

        transcripts = graph.t_df
        assert transcripts.gname.tolist() == transcripts.gid.tolist()
Exemple #5
0
    def test_get_tpm_cols(self):
        """Check get_tpm_cols with and without an explicit datasets argument.

        Without an argument the method should return the graph's tpm list;
        with an argument it should return the matching tpm columns or raise
        an 'Abundance for dataset ...' error for datasets lacking abundance.
        """
        a = swan.SwanGraph()

        # 1: empty graph, no datasets argument
        assert a.get_tpm_cols() == []

        # 2: one dataset in graph, no datasets argument
        a.tpm = ['a_tpm']
        assert a.get_tpm_cols() == ['a_tpm']

        # 3: > one dataset in graph, no datasets argument
        a.tpm = ['a_tpm', 'b_tpm']
        assert a.get_tpm_cols() == ['a_tpm', 'b_tpm']

        # 4: empty graph, datasets argument
        # also tests ability to handle char input
        a = swan.SwanGraph()
        with pytest.raises(Exception) as excinfo:
            a.get_tpm_cols('a')
        assert 'Abundance for dataset a' in str(excinfo.value)

        # 5: graph with one dataset and datasets argument already in graph
        a.tpm = ['a_tpm']
        a.counts = ['a_counts']
        assert a.get_tpm_cols('a') == ['a_tpm']

        # 6: graph with one dataset and datasets argument not in graph
        with pytest.raises(Exception) as excinfo:
            a.get_tpm_cols('b')
        assert 'Abundance for dataset b' in str(excinfo.value)

        # 7: graph with more than one dataset and datasets argument in graph
        a.tpm = ['a_tpm', 'b_tpm']
        a.counts = ['a_counts', 'b_counts']
        assert a.get_tpm_cols(['a', 'b']) == ['a_tpm', 'b_tpm']

        # 8: graph with more than one dataset and not all datasets argument in graph
        with pytest.raises(Exception) as excinfo:
            a.get_tpm_cols(['a', 'b', 'c'])
        # the message must actually be compared against the raised error;
        # asserting the bare string literal would always pass
        assert 'Abundance for dataset c' in str(excinfo.value)
Exemple #6
0
def get_dummy_sg(special=None):
    """Build a small hand-made SwanGraph (3 vertices, 3 edges, 3 transcripts).

    `special` selects a variation of the fixture:
      * 'intron'  -- edge (0, 1) is relabelled as an intron
      * 'no_locs' -- get_loc_types() is not called on the graph
      * 'dataset' -- a 'dataset_a' column set to True is added to every table
    Any other value (including the default None) yields the plain graph.
    """
    def _indexed(frame, key):
        # create the duplicate index column, then install it as the index
        return swan.set_dupe_index(swan.create_dupe_index(frame, key), key)

    graph = swan.SwanGraph()

    vertices = _indexed(
        pd.DataFrame({
            'chrom': [1, 1, 1],
            'coord': [1, 3, 2],
            'strand': ['+', '+', '+'],
            'vertex_id': [0, 1, 2]
        }),
        'vertex_id')

    edges = _indexed(
        pd.DataFrame({
            'edge_id': [(0, 2), (0, 1), (1, 2)],
            'v1': [0, 0, 1],
            'v2': [2, 1, 2],
            'edge_type': ['exon', 'exon', 'intron'],
            'strand': ['+', '+', '+']
        }),
        'edge_id')

    transcripts = _indexed(
        pd.DataFrame({
            'tid': [2, 1, 0],
            'gid': [0, 0, 0],
            'gname': ['0', '0', '0'],
            'path': [[0, 1, 2], [1, 2], [0, 1]],
            'counts_a': [0, 0, 12],
            'counts_b': [1, 0, 14]
        }),
        'tid')

    if special == 'intron':
        edges.loc[(0, 1), 'edge_type'] = 'intron'

    graph.loc_df = vertices
    graph.edge_df = edges
    graph.t_df = transcripts

    # every variant except 'no_locs' gets its location types computed
    if special != 'no_locs':
        graph.get_loc_types()

    if special == 'dataset':
        graph.datasets = ['dataset_a']
        for table in (graph.loc_df, graph.edge_df, graph.t_df):
            table['dataset_a'] = True

    return graph
Exemple #7
0
    def test_add_annotation(self):
        """Check the novelty labels that result from add_annotation.

        Three starting points are tried: an empty graph, a graph holding a
        dataset without novelty labels, and a graph holding a dataset that
        already has novelty labels.
        """
        def tid_novelty_pairs(graph):
            # (transcript id, novelty) for every row of the transcript table
            return graph.t_df.apply(lambda row: (row.tid, row.novelty), axis=1)

        # case 1: annotation added to an empty graph -> every row is 'Known'
        print('testing for correct novelty assignment when adding annotation')
        sg = swan.SwanGraph()
        sg.add_annotation('input_files/annot.gtf')
        n_transcripts = len(sg.t_df.index)
        n_known = len(sg.t_df.loc[sg.t_df.novelty == 'Known'].index)
        assert n_transcripts == n_known

        # case 2: graph already holds data, but that data has no
        # novelty categories
        print(
            'testing for correct novelty assignment when adding annotation '
            'to graph with preexisting data that does not contain novelty info'
        )
        sg = swan.SwanGraph()
        sg.add_dataset('a', 'input_files/annot_2.gtf')
        sg.add_annotation('input_files/annot.gtf')
        expected = [
            ('ENST01', 'Known'),
            ('ENST03', 'Known'),
            ('ENST07', 'Known'),
            ('ENST02', 'Undefined'),
            ('ENST04', 'Undefined'),
            ('ENST08', 'Undefined'),
        ]
        observed = tid_novelty_pairs(sg)
        check_pairs(expected, observed)

        # case 3: graph already holds data that does have novelty categories
        print('testing for correct novelty assignment when adding annotation '
              'to graph with preexisting data that does contain novelty info')
        sg = swan.SwanGraph()
        sg.add_dataset('a', 'input_files/annot_2.gtf')
        sg.t_df['novelty'] = ['ISM', 'NNC', 'NIC', 'NIC']
        sg.add_annotation('input_files/annot.gtf')
        expected = [
            ('ENST01', 'Known'),
            ('ENST03', 'Known'),
            ('ENST07', 'Known'),
            ('ENST02', 'NNC'),
            ('ENST04', 'NIC'),
            ('ENST08', 'NIC'),
        ]
        observed = tid_novelty_pairs(sg)
        check_pairs(expected, observed)
Exemple #8
0
    def test_is_empty(self):
        """Check is_empty() for a graph with zero, one and two datasets."""
        graph = swan.SwanGraph()

        # (datasets to assign before checking, expected is_empty() result);
        # None means "leave the freshly constructed graph untouched"
        scenarios = [
            (None, True),          # 1: no datasets (including annotation)
            (['a'], False),        # 2: graph has one dataset
            (['a', 'b'], False),   # 3: graph has more than one dataset
        ]
        for assigned, expected in scenarios:
            if assigned is not None:
                graph.datasets = assigned
            print(graph.datasets)
            assert graph.is_empty() == expected
Exemple #9
0
    def test_get_dataset_cols(self):
        """Check get_dataset_cols() for a graph with zero, one and two datasets."""
        graph = swan.SwanGraph()

        # (datasets to assign before checking, expected column list);
        # None means "leave the freshly constructed graph untouched"
        scenarios = [
            (None, []),                    # 1: no datasets (including annotation)
            (['a'], ['a']),                # 2: graph has one dataset
            (['a', 'b'], ['a', 'b']),      # 3: graph has more than one dataset
        ]
        for assigned, expected in scenarios:
            if assigned is not None:
                graph.datasets = assigned
            print(graph.datasets)
            assert graph.get_dataset_cols() == expected
Exemple #10
0
    def test_subset_on_gene(self):
        """Check that subset_on_gene keeps only one gene's rows in each table.

        A graph with three genes (two transcripts each) is built by hand and
        subset on gene 0; the transcript, vertex and edge tables must then
        contain only the entries reachable from gene 0's transcript paths.
        """
        # gene to subset on; used in the subset_on_gene call below
        gid = 0
        a = swan.SwanGraph()
        a.t_df = pd.DataFrame({
            'tid': [0, 1, 2, 3, 4, 5],
            'gid': [0, 0, 1, 1, 2, 2],
            'path': [[0, 1, 2], [2, 3, 4], [5, 6, 7], [6, 7, 8],
                     [9, 10, 11, 12], [9, 11, 12]]
        })
        a.loc_df = pd.DataFrame({
            'vertex_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
            'strand':
            ['+', '+', '+', '+', '+', '-', '-', '-', '-', '-', '-', '-', '-'],
            'chrom': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2],
            'coord': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        })
        a.edge_df = pd.DataFrame({
            'edge_id': [(0, 1), (1, 2), (2, 3), (3, 4), (5, 6), (6, 7), (7, 8),
                        (9, 10), (10, 11), (11, 12)]
        })
        a.t_df = swan.create_dupe_index(a.t_df, 'tid')
        a.t_df = swan.set_dupe_index(a.t_df, 'tid')
        a.loc_df = swan.create_dupe_index(a.loc_df, 'vertex_id')
        a.loc_df = swan.set_dupe_index(a.loc_df, 'vertex_id')
        a.edge_df = swan.create_dupe_index(a.edge_df, 'edge_id')
        a.edge_df = swan.set_dupe_index(a.edge_df, 'edge_id')

        # check subsetting for gene 0
        a = swan.subset_on_gene(a, gid)

        # only the two transcripts of gene 0 remain
        test = a.t_df['tid'].tolist()
        control = [0, 1]
        check_pairs(control, test)

        # only the vertices on those transcripts' paths remain
        test = a.loc_df['vertex_id'].tolist()
        control = [0, 1, 2, 3, 4]
        check_pairs(control, test)

        # only the edges between those vertices remain
        test = a.edge_df['edge_id'].tolist()
        control = [(0, 1), (1, 2), (2, 3), (3, 4)]
        check_pairs(control, test)
Exemple #11
0
    def test_check_datasets(self):
        """Check that check_datasets raises for names absent from the graph.

        The graph lists datasets 'a' and 'b'; querying them (as a string or
        a list) must return None, while any query containing 'c' must raise
        with a 'c not present in graph' message.
        """
        g = swan.SwanGraph()
        g.datasets = ['a', 'b']

        # 1: test for a dataset that's not there
        with pytest.raises(Exception) as excinfo:
            g.check_datasets(['c'])
        assert "c not present in graph" in str(excinfo.value)

        # 2: test for a dataset that is there, input in string form
        result = g.check_datasets('a')
        # identity check: None is a singleton, so `is` is the right test
        assert result is None

        # 3: test for multiple datasets that are in the graph
        result = g.check_datasets(['a', 'b'])
        assert result is None

        # 4: test for a dataset that's not there in a list
        # of datasets that are there
        with pytest.raises(Exception) as excinfo:
            g.check_datasets(['a', 'b', 'c'])
        assert 'c not present in graph' in str(excinfo.value)
Exemple #12
0
    def test_plotting(self):
        """Drive a sequence of plot helpers against a hand-built SwanGraph.

        The graph is first populated with one gene (ENSG01) and passed to
        plot0, then extended with a second gene (ENSG02) on another
        chromosome and strand, and finally threaded through plot0_5 and
        plot1..plot8. Each helper receives the graph plus lists of
        transcript ids, vertex ids and edge ids and returns the graph that
        is handed to the next helper, so the call order matters.
        """
        # set up testing swangraph
        sg = swan.SwanGraph()
        sg.datasets = ['annotation', 'a', 'b']
        # gene 1: seven vertices on chromosome 1, + strand, with per-dataset
        # presence flags in the 'annotation', 'a' and 'b' columns
        gene1_loc_df = pd.DataFrame({
            'chrom': [1, 1, 1, 1, 1, 1, 1],
            'coord': [5, 10, 15, 20, 25, 30, 35],
            'strand': ['+', '+', '+', '+', '+', '+', '+'],
            'vertex_id': [0, 1, 2, 3, 4, 5, 6],
            'annotation': [True, True, True, True, True, True, False],
            'a': [True, True, False, True, True, True, True],
            'b': [False, False, True, True, True, True, False]
        })
        # gene 1: four transcripts, each described by its path of vertex ids
        gene1_t_df = pd.DataFrame({
            'gname': ['GENE01', 'GENE01', 'GENE01', 'GENE01'],
            'gid': ['ENSG01', 'ENSG01', 'ENSG01', 'ENSG01'],
            'tid': ['ENST01', 'ENST02', 'ENST03', 'ENST04'],
            'path': [[0, 1, 2, 3, 4, 5], [0, 3, 4, 5], [0, 1, 4, 6],
                     [2, 3, 4, 5]],
            'annotation': [True, True, True, False],
            'a': [False, True, True, False],
            'b': [False, False, False, True]
        })
        # gene 1: eight edges given as (v1, v2) endpoints plus their type
        gene1_edge_df = pd.DataFrame({
            'v1': [0, 1, 2, 3, 4, 0, 4, 1],
            'v2': [1, 2, 3, 4, 5, 3, 6, 4],
            'edge_type': [
                'exon', 'intron', 'exon', 'intron', 'exon', 'exon', 'exon',
                'intron'
            ],
            'annotation': [True, True, True, True, True, True, False, False],
            'a': [True, False, False, True, True, True, True, True],
            'b': [False, False, True, True, True, False, False, False]
        })
        # the edge id is the (v1, v2) tuple of its endpoints
        gene1_edge_df['edge_id'] = gene1_edge_df.apply(lambda x: (x.v1, x.v2),
                                                       axis=1)

        sg.loc_df = gene1_loc_df
        sg.edge_df = gene1_edge_df
        sg.t_df = gene1_t_df

        # index each table on its id column, then compute location types
        sg.loc_df = create_dupe_index(sg.loc_df, 'vertex_id')
        sg.loc_df = set_dupe_index(sg.loc_df, 'vertex_id')
        sg.edge_df = create_dupe_index(sg.edge_df, 'edge_id')
        sg.edge_df = set_dupe_index(sg.edge_df, 'edge_id')
        sg.t_df = create_dupe_index(sg.t_df, 'tid')
        sg.t_df = set_dupe_index(sg.t_df, 'tid')
        sg.get_loc_types()

        # id lists for gene 1 that are handed to the plot helpers
        gene1_tids = gene1_t_df.tid.tolist()
        gene1_locs = gene1_loc_df.vertex_id.tolist()
        gene1_edges = gene1_edge_df.edge_id.tolist()

        # 0th plot - gene summary graph of ENSG01
        # (run with dataset 'b' removed from the graph's dataset list)
        sg.datasets = ['annotation', 'a']
        sg = plot0(sg, gene1_tids, gene1_locs, gene1_edges)

        # gene 2 mirrors gene 1's layout on chromosome 4, - strand, with
        # descending coordinates
        gene2_loc_df = pd.DataFrame({
            'chrom': [4, 4, 4, 4, 4, 4, 4],
            'coord': [35, 30, 25, 20, 15, 10, 5],
            'strand': ['-', '-', '-', '-', '-', '-', '-'],
            # vertex ids continue after gene 1's (0-6) so the genes don't clash
            'vertex_id': [7, 8, 9, 10, 11, 12, 13],
            'annotation': [True, True, True, True, True, True, False],
            'a': [True, True, False, True, True, True, True],
            'b': [False, False, True, True, True, True, False]
        })
        gene2_t_df = pd.DataFrame({
            'gname': ['GENE02', 'GENE02', 'GENE02', 'GENE02'],
            'gid': ['ENSG02', 'ENSG02', 'ENSG02', 'ENSG02'],
            'tid': ['ENST05', 'ENST06', 'ENST07', 'ENST08'],
            'path': [[7, 8, 9, 10, 11, 12], [7, 10, 11, 12], [7, 8, 11, 13],
                     [9, 10, 11, 12]],
            'annotation': [True, True, True, False],
            'a': [False, True, True, False],
            'b': [False, False, False, True]
        })
        gene2_edge_df = pd.DataFrame({
            'v1': [7, 8, 9, 10, 11, 7, 11, 8],
            'v2': [8, 9, 10, 11, 12, 10, 13, 11],
            'edge_type': [
                'exon', 'intron', 'exon', 'intron', 'exon', 'exon', 'exon',
                'intron'
            ],
            'annotation': [True, True, True, True, True, True, False, False],
            'a': [True, False, False, True, True, True, True, True],
            'b': [False, False, True, True, True, False, False, False]
        })
        gene2_edge_df['edge_id'] = gene2_edge_df.apply(lambda x: (x.v1, x.v2),
                                                       axis=1)

        # replace the graph's tables with the two genes stacked together
        sg.loc_df = pd.concat([gene1_loc_df, gene2_loc_df])
        sg.edge_df = pd.concat([gene1_edge_df, gene2_edge_df])
        sg.t_df = pd.concat([gene1_t_df, gene2_t_df])

        # re-index the combined tables and recompute location types
        sg.loc_df = create_dupe_index(sg.loc_df, 'vertex_id')
        sg.loc_df = set_dupe_index(sg.loc_df, 'vertex_id')
        sg.edge_df = create_dupe_index(sg.edge_df, 'edge_id')
        sg.edge_df = set_dupe_index(sg.edge_df, 'edge_id')
        sg.t_df = create_dupe_index(sg.t_df, 'tid')
        sg.t_df = set_dupe_index(sg.t_df, 'tid')
        sg.get_loc_types()

        # id lists for gene 2
        # NOTE(review): gene2_locs and gene2_edges are computed but never
        # passed to any helper below; the calls that take gene2_tids still
        # receive gene1_locs / gene1_edges -- confirm this is intended
        gene2_tids = gene2_t_df.tid.tolist()
        gene2_locs = gene2_loc_df.vertex_id.tolist()
        gene2_edges = gene2_edge_df.edge_id.tolist()

        # remake the same plot and force it to update
        # (dataset 'b' is restored to the dataset list first)
        sg.datasets = ['annotation', 'a', 'b']
        sg = plot0_5(sg, gene2_tids, gene1_locs, gene1_edges)

        # first plot - gene summary graph of ENSG01
        sg = plot1(sg, gene1_tids, gene1_locs, gene1_edges)

        # plot a transcript through the same gene
        sg = plot2(sg, gene1_tids, gene1_locs, gene1_edges)

        # make sure we are doing the right thing after plotting ENST01
        # after plotting it as a browser image
        sg = plot3(sg, gene1_tids, gene1_locs, gene1_edges)

        # plot the same transcript with indicate_novel
        sg = plot4(sg, gene1_tids, gene1_locs, gene1_edges)

        # plot a different transcript but change the indicate opt
        sg = plot5(sg, gene1_tids, gene1_locs, gene1_edges)

        # plot a new gene and use indicate_dataset
        sg = plot6(sg, gene2_tids, gene1_locs, gene1_edges)

        # plot a transcript from the other gene using browser
        sg = plot7(sg, gene1_tids, gene1_locs, gene1_edges)

        # plot a transcript from the other gene using
        # indicate_dataset b
        sg = plot8(sg, gene2_tids, gene1_locs, gene1_edges)
Exemple #13
0
def gen_toy_sg():
    """Return a SwanGraph holding dataset 'a' loaded from input_files/annot.gtf."""
    toy_graph = swan.SwanGraph()
    toy_graph.add_dataset('a', 'input_files/annot.gtf')
    return toy_graph
Exemple #14
0
    def test_merge_sgs(self):
        """Check the merged tables after adding two GTF datasets to one graph.

        Datasets 'a' and 'b' are added in turn; the test then verifies the
        index names and column sets of loc_df, edge_df and t_df, followed by
        the row-by-row contents (coordinates, ids, types, paths and the
        per-dataset presence flags) of each table.
        """
        a_gtf = 'input_files/annot.gtf'
        b_gtf = 'input_files/annot_2.gtf'
        sg = swan.SwanGraph()
        sg.add_dataset('a', a_gtf, include_isms=True)
        sg.add_dataset('b', b_gtf, include_isms=True)

        # debugging output: peek at the merged tables
        print(sg.loc_df.head())
        print(sg.edge_df.head())
        print(sg.t_df.head())

        # check that the format of dfs are ok
        assert sg.loc_df.index.names == ['vertex_id']
        control = [
            'coord', 'chrom', 'strand', 'a', 'b', 'vertex_id', 'internal',
            'TSS', 'TES'
        ]
        test = sg.loc_df.columns.tolist()
        check_pairs(control, test)

        assert sg.edge_df.index.names == ['edge_id']
        control = ['v1', 'v2', 'edge_type', 'strand', 'a', 'b', 'edge_id']
        test = sg.edge_df.columns.tolist()
        check_pairs(control, test)

        assert sg.t_df.index.names == ['tid']
        control = ['tid', 'gid', 'gname', 'path', 'a', 'b']
        test = sg.t_df.columns.tolist()
        check_pairs(control, test)

        # test that loc_df merging happened correctly
        # query chr, coord, strand, a and b columns
        chrs = sg.loc_df['chrom'].tolist()
        control = [1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 1, 1, 1, 1, 1, 4, 4]
        print('test chrs: ')
        print(chrs)
        print('control chrs: ')
        print(control)
        # compare observed against expected; this check previously asserted
        # `control == control`, which could never fail
        assert chrs == control

        coords = sg.loc_df['coord'].tolist()
        control = [
            1, 90, 100, 500, 600, 900, 1000, 1, 10, 15, 20, 2000, 1500, 1000,
            900, 800, 4000, 1000
        ]
        print('test coords: ')
        print(coords)
        print('control coords: ')
        print(control)
        assert coords == control

        strand = sg.loc_df['strand'].tolist()
        control = [
            '+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '-', '-',
            '-', '-', '-', '-', '-'
        ]
        print('test strands: ')
        print(strand)
        print('control strands: ')
        print(control)
        assert strand == control

        # presence flags are written as 0/1 and converted to bool to compare
        a = sg.loc_df['a'].tolist()
        control = [
            bool(i)
            for i in [1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]
        ]
        print('test a presence: ')
        print(a)
        print('control a presence: ')
        print(control)
        assert a == control

        b = sg.loc_df['b'].tolist()
        control = [
            bool(i)
            for i in [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0]
        ]
        print('test b presence: ')
        print(b)
        print('control b presence: ')
        print(control)
        assert b == control

        # test that edge_df merging and id mapping happened correctly
        # query edge_id, edge_type, a and b columns
        edge_id = sg.edge_df['edge_id'].tolist()
        control = [(0, 1), (0, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6),
                   (7, 8), (8, 9), (9, 10), (11, 12), (12, 13), (13, 14),
                   (13, 15), (16, 17)]
        print('test edge_ids: ')
        print(edge_id)
        print('control edge_ids')
        print(control)
        assert edge_id == control

        edge_type = sg.edge_df['edge_type'].tolist()
        control = [
            'exon', 'exon', 'intron', 'intron', 'exon', 'intron', 'exon',
            'exon', 'intron', 'exon', 'exon', 'intron', 'exon', 'exon', 'exon'
        ]
        print('test edge_types: ')
        print(edge_type)
        print('control edge_types: ')
        print(control)
        assert edge_type == control

        a = sg.edge_df['a'].tolist()
        control = [
            bool(i) for i in [0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1]
        ]
        print('test a presence: ')
        print(a)
        print('control a presence: ')
        print(control)
        assert a == control

        b = sg.edge_df['b'].tolist()
        control = [
            bool(i) for i in [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0]
        ]
        print('test b presence: ')
        print(b)
        print('control b presence: ')
        print(control)
        assert b == control

        # test that t_df merging and id mapping happened correctly
        # query tid, path, a and b columns
        tid = sg.t_df['tid'].tolist()
        control = ['ENST01', 'ENST02', 'ENST03', 'ENST04', 'ENST07', 'ENST08']
        print('test tids: ')
        print(tid)
        print('control tids: ')
        print(control)
        assert tid == control

        # paths are converted to tuples so they compare against the control
        paths = [tuple(path) for path in sg.t_df['path'].tolist()]
        control = [(0, 2, 3, 4, 5, 6), (0, 1, 3, 4, 5, 6), (11, 12, 13, 14),
                   (11, 12, 13, 15), (16, 17), (7, 8, 9, 10)]
        print('test paths: ')
        print(paths)
        print('control paths: ')
        print(control)
        assert paths == control

        a = sg.t_df['a'].tolist()
        control = [bool(i) for i in [1, 0, 1, 0, 1, 0]]
        print('test a presence: ')
        print(a)
        print('control a presence: ')
        print(control)
        assert a == control

        b = sg.t_df['b'].tolist()
        control = [bool(i) for i in [1, 1, 0, 1, 0, 1]]
        print('test b presence: ')
        print(b)
        print('control b presence: ')
        print(control)
        assert b == control
Exemple #15
0
def process_gtf():
    """Return a SwanGraph with dataset 'test' loaded from weird_gtf_entries.gtf."""
    graph = swan.SwanGraph()
    graph.add_dataset('test', 'input_files/weird_gtf_entries.gtf')
    return graph
Exemple #16
0
import swan_vis as swan

# input files: one shared abundance table, a reference annotation and one
# GTF per replicate
ab_file = 'all_talon_abundance_filtered.tsv'
ref_gtf = '/Users/fairliereese/mortazavi_lab/ref/gencode.v29/gencode.v29.annotation.gtf'
hep_1_gtf = 'hepg2_1_talon.gtf'
hep_2_gtf = 'hepg2_2_talon.gtf'
hff_1_gtf = 'hffc6_1_talon.gtf'
hff_2_gtf = 'hffc6_2_talon.gtf'
hff_3_gtf = 'hffc6_3_talon.gtf'

# build the graph: annotation first, then every replicate together with
# its column from the abundance table
sg = swan.SwanGraph()
sg.add_annotation(ref_gtf)
for _dataset, _gtf, _count_col in (
        ('HepG2_1', hep_1_gtf, 'hepg2_1'),
        ('HepG2_2', hep_2_gtf, 'hepg2_2'),
        ('HFFc6_1', hff_1_gtf, 'hffc6_1'),
        ('HFFc6_2', hff_2_gtf, 'hffc6_2'),
        ('HFFc6_3', hff_3_gtf, 'hffc6_3'),
):
    sg.add_dataset(_dataset, _gtf, counts_file=ab_file, count_cols=_count_col)

# write the graph out and load it back from the saved file
sg.save_graph('swan')
sg = swan.SwanGraph('swan.p')

# differential expression tests between the two replicate groups
dataset_groups = [
    ['HepG2_1', 'HepG2_2'],
    ['HFFc6_1', 'HFFc6_2', 'HFFc6_3'],
]
sg.de_gene_test(dataset_groups)
sg.de_transcript_test(dataset_groups)

# report how many hits each analysis produced
de_gids, _ = sg.get_de_genes()
print('Found {} differentially expressed genes'.format(len(de_gids)))

de_tids, _ = sg.get_de_transcripts()
print('Found {} differentially expressed transcripts'.format(len(de_tids)))

is_gids, _ = sg.find_isoform_switching_genes()
print('Found {} isoform switching genes'.format(len(is_gids)))
Exemple #17
0
    def test_num_novel_known_isoforms(self):
        """Check find_genes_with_novel_isoforms on a hand-built transcript table.

        Verifies the order of the returned gene list, that gene 3 is left
        out of the returned table, the known/novel counts for genes 0-2, and
        that an error is raised once the annotation is removed.
        """
        sg = swan.SwanGraph()
        transcript_table = pd.DataFrame({
            'tid': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            'gid': [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3],
            'annotation': [
                True, True, True, True, False, True, True, False, False, False,
                True, False, False, True, True, True
            ],
            'a': [
                False, True, True, False, True, False, False, True, True,
                False, False, True, False, False, False, False
            ],
            'b': [
                False, False, True, True, True, False, True, False, True, True,
                True, False, True, False, False, False
            ]
        })
        transcript_table = swan.create_dupe_index(transcript_table, 'tid')
        sg.t_df = swan.set_dupe_index(transcript_table, 'tid')
        sg.datasets = ['annotation', 'a', 'b']

        genes, g_df = sg.find_genes_with_novel_isoforms()

        # check that the genes were returned in the right order
        print(genes)
        assert genes == [2, 1, 0]

        # check that gene 3 didn't get included
        g_df_genes = g_df.index.tolist()
        print(g_df_genes)
        check_pairs([0, 1, 2], g_df_genes)

        # check that sum of known/novel models is correct
        # gene id -> (expected known count, expected novel count)
        expected_counts = {0: (3, 0), 1: (1, 1), 2: (1, 5)}
        for gene, (known_control, novel_control) in expected_counts.items():
            known_models = g_df.loc[gene, 'known']
            print('gene {} num known:'.format(gene))
            print(known_models)
            novel_models = g_df.loc[gene, 'novel']
            print('gene {} num novel:'.format(gene))
            print(novel_models)
            assert known_models == known_control
            assert novel_models == novel_control

        # make sure that when we're trying this w/o an annotation,
        # we raise the error
        sg.t_df.drop('annotation', axis=1, inplace=True)
        sg.datasets.remove('annotation')
        with pytest.raises(Exception) as excinfo:
            genes, g_df = sg.find_genes_with_novel_isoforms()
        assert 'No annotation data' in str(excinfo.value)
Exemple #18
0
def get_dummy_merge_sgs():
    """Build two small hand-made SwanGraphs intended for merge tests.

    Graph `a` is tagged as dataset 'a' (a True-valued 'a' column on each of
    its tables); graph `b` carries no dataset columns. Both graphs have
    get_loc_types() applied before being returned as the tuple (a, b).
    """
    a = swan.SwanGraph()
    b = swan.SwanGraph()

    def _index_both(table_attr, key):
        # create the dupe index on both graphs' tables, then set it on both
        for step in (swan.create_dupe_index, swan.set_dupe_index):
            for graph in (a, b):
                setattr(graph, table_attr,
                        step(getattr(graph, table_attr), key))

    # vertex tables
    a.loc_df = pd.DataFrame({
        'chrom': [1, 1, 1, 2],
        'coord': [1, 2, 3, 1],
        'strand': ['+', '+', '+', '+'],
        'vertex_id': [0, 1, 2, 3]
    })
    b.loc_df = pd.DataFrame({
        'chrom': [1, 1, 1, 2, 3],
        'coord': [1, 2, 4, 1, 1],
        'strand': ['+', '+', '+', '-', '+'],
        'vertex_id': [1, 2, 3, 4, 5]
    })
    _index_both('loc_df', 'vertex_id')

    # edge tables
    a.edge_df = pd.DataFrame({
        'edge_id': [(0, 1), (1, 2), (0, 2), (2, 3)],
        'v1': [0, 1, 0, 2],
        'v2': [1, 2, 2, 3],
        'edge_type': ['exon', 'intron', 'exon', 'exon'],
        'strand': ['+', '+', '+', '+']
    })
    b.edge_df = pd.DataFrame({
        'edge_id': [(1, 2), (1, 3), (2, 4), (3, 4)],
        'v1': [1, 1, 2, 3],
        'v2': [2, 3, 4, 4],
        'strand': ['+', '+', '+', '+'],
        'edge_type': ['exon', 'exon', 'intron', 'intron']
    })
    _index_both('edge_df', 'edge_id')

    # transcript tables
    a.t_df = pd.DataFrame({
        'tid': [0, 1, 3],
        'gid': [0, 0, 0],
        'gname': ['0', '0', '0'],
        'path': [[0, 1], [0, 1, 2, 3], [0, 2]]
    })
    b.t_df = pd.DataFrame({
        'tid': [0, 2, 4],
        'gid': [0, 0, 0],
        'gname': ['0', '0', '0'],
        'path': [[1, 2], [1, 2, 4], [1, 2, 3]]
    })
    _index_both('t_df', 'tid')

    # add 'dataset a' to a
    a.datasets = ['a']
    for table in (a.loc_df, a.edge_df, a.t_df):
        table['a'] = True

    for graph in (a, b):
        graph.get_loc_types()

    return a, b
Exemple #19
0
### Getting Started

import swan_vis as swan

# input files, all expected under the local data/ directory:
# the reference annotation, one GTF per replicate and a shared abundance table
annot_gtf = 'data/gencode.v29.annotation.gtf'
hep_1_gtf = 'data/hepg2_1_talon.gtf'
hep_2_gtf = 'data/hepg2_2_talon.gtf'
hff_1_gtf = 'data/hffc6_1_talon.gtf'
hff_2_gtf = 'data/hffc6_2_talon.gtf'
hff_3_gtf = 'data/hffc6_3_talon.gtf'
ab_file = 'data/all_talon_abundance_filtered.tsv'
# NOTE(review): talon_db is not used in the statements below; presumably it
# is consumed later in the tutorial -- confirm
talon_db = 'data/talon.db'

# initialize a new SwanGraph
sg = swan.SwanGraph()

# add an annotation transcriptome
sg.add_annotation(annot_gtf)

# add each dataset's transcriptome to the SwanGraph together with its
# abundance information: counts_file is the shared abundance table and
# count_cols names that dataset's column in it
sg.add_dataset('HepG2_1', hep_1_gtf, counts_file=ab_file, count_cols='hepg2_1')
sg.add_dataset('HepG2_2', hep_2_gtf, counts_file=ab_file, count_cols='hepg2_2')
sg.add_dataset('HFFc6_1', hff_1_gtf, counts_file=ab_file, count_cols='hffc6_1')
sg.add_dataset('HFFc6_2', hff_2_gtf, counts_file=ab_file, count_cols='hffc6_2')
sg.add_dataset('HFFc6_3', hff_3_gtf, counts_file=ab_file, count_cols='hffc6_3')

# save the SwanGraph as a Python pickle file
sg.save_graph('swan')

### Analysis