def calculate_perturbation_factor(self,
                                      experiment,
                                      gene,
                                      pathway,
                                      visited=None):
        """Recursively compute the perturbation factor of ``gene`` in ``pathway``.

        The result is the gene's own fold change (``MAX_IF`` when the stored
        value is NaN, 0 when none of its names is among the experiment genes)
        plus, for every not yet visited upstream node, that node's
        perturbation factor weighted by the mean interaction weight of the
        connecting edge and divided by the upstream node's out-edge count.

        Args:
            experiment: passed through unchanged to the recursive calls.
            gene: node identifier; may hold several comma-separated names.
            pathway: graph providing ``in_edges``/``out_edges`` and a
                ``'type'`` edge attribute.
            visited: nodes already on the current recursion path.

        Returns:
            The accumulated perturbation factor.
        """
        visited = [] if not visited else visited

        pf = 0
        names = gene.split(',')
        # NOTE(review): this guard compares the raw (unstripped) names while
        # the loop below strips them, so names with surrounding whitespace can
        # never pass the guard - confirm whether that is intended.
        if set(names) & set(self.experiment_genes):
            for name in names:
                if name.strip() in self.experiment_genes:
                    # get ΔE (looked up once instead of twice)
                    fold_change = self.FC['FC'][Gene(name)]
                    pf = MAX_IF if isnan(fold_change) else fold_change
                    break

        # The edge-type mapping does not change between iterations, so it is
        # built at most once, and only when an unvisited edge needs it.
        edge_types = None

        # genes directly upstream
        for edge in pathway.in_edges(gene):
            upstream = edge[0]
            if upstream in visited:
                continue
            if edge_types is None:
                edge_types = get_edge_attributes(pathway, 'type')
            # interaction types without a known weight contribute 0
            beta = mean(
                [interaction_weights.get(t, 0) for t in edge_types[edge]])
            # genes directly downstream of the upstream node
            dstream = len(pathway.out_edges(upstream))
            pf += self.calculate_perturbation_factor(
                experiment, upstream, pathway,
                visited + [edge[1]]) * beta / dstream
        return pf
Beispiel #2
0
def test_file_with_description(test_files, tmpdir):
    """Parsing a control file with a description column, with and without -d."""
    control_rows = (
        'Gene	Description	Control_1	Control_2',
        'TP53	Tumour protein 53	6	6',
        'BRCA2	Breast cancer type 2 s. protein	6	7',
    )
    create_files(tmpdir, {'control_with_descriptions.tsv': control_rows})

    expected_warning = (
        'First line of your file contains "description" column '
        'but you did not provide "--description_column" argument.')

    # user forgot the flag: a warning is raised and the description
    # column is counted as one more sample
    with pytest.warns(UserWarning, match=expected_warning):
        opts = p_parse('case t.tsv control control_with_descriptions.tsv')
        assert len(opts.control.sample_collection.samples) == 3

    # user remembered the flag: only the two real samples remain
    opts = p_parse('case t.tsv control control_with_descriptions.tsv -d')
    control_samples = opts.control.sample_collection.samples
    assert len(control_samples) == 2

    parsed_genes = set(control_samples[0].genes)
    assert parsed_genes == {
        Gene('TP53', description='Tumour protein 53'),
        Gene('BRCA2', description='Breast cancer type 2 s. protein'),
    }
def create_test_models():
    """Build a small set of linked model objects and return them by name.

    The return value is ``locals()``, so the dictionary keys are exactly the
    local variable names used below (``protein``, ``mutation``,
    ``protein_kinase``, ``kinase``, ``site``); renaming a local changes the
    returned mapping.
    """
    protein = Protein(refseq='NM_0001',
                      gene=Gene(name='SOMEGENE'),
                      sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # mutation details are created without binding them to a local name,
    # so they are not part of the returned dictionary
    MC3Mutation(mutation=mutation,
                cancer=Cancer(code='CAN'),
                samples='Sample A,Sample B',
                count=2)
    InheritedMutation(mutation=mutation,
                      clin_data=[
                          ClinicalData(disease=Disease(name='Some disease'),
                                       sig_code=5),
                          ClinicalData(disease=Disease(name='Other disease'),
                                       sig_code=2)
                      ])

    # a second protein acting as the kinase of the site defined on `protein`
    protein_kinase = Protein(refseq='NM_0002',
                             gene=Gene(name='OTHERGENE'),
                             sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein,
                position=1,
                residue='A',
                kinases={kinase},
                pmid={1, 2},
                types={SiteType(name='glycosylation')})
    protein.sites = [site]

    return locals()
def test_sample_init():
    """A Sample exposes the name and the Gene-keyed data it was built with."""
    expression = {Gene('BAD'): 1.2345, Gene('FUCA2'): 6.5432}

    tumour = Sample('Tumour_1', expression)

    assert tumour.name == 'Tumour_1'
    for key in tumour.data.keys():
        assert isinstance(key, Gene)
    assert tumour.data == expression
Beispiel #5
0
    def test_mapping(self):
        """Sites defined on one isoform are mapped by sequence to the others."""
        isoforms_a = [
            # the full isoform of gene A
            Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
            # a trimmed isoform of gene A
            Protein(refseq='NM_02', sequence='AAAXAA'),
        ]
        gene_a = Gene(name='A', isoforms=isoforms_a)
        isoforms_b = [
            Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
            Protein(refseq='NM_04', sequence='BBBYBB'),
        ]
        gene_b = Gene(name='B', isoforms=isoforms_b)
        db.session.add_all([gene_a, gene_b])

        # whoops, NM_03 has been accidentally removed (!)
        lost_isoform = Protein.query.filter_by(refseq='NM_03').one()
        db.session.delete(lost_isoform)
        db.session.commit()

        known_proteins = create_key_model_dict(Protein, 'refseq')
        mapper = SiteMapper(known_proteins,
                            lambda s: f'{s.position}{s.residue}')

        site_rows = {
            'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
            'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
        }
        sites = DataFrame.from_dict(data=site_rows, orient='index')
        sites.columns = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]

        mapped_sites = mapper.map_sites_by_sequence(sites)
        sites_by_isoform = group_by_isoform(mapped_sites)

        # one from NM_01 (defined), from NM_02 (mapped), from NM_04 (mapped)
        assert len(mapped_sites) == 3
        assert set(sites_by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

        full_a = sites_by_isoform['NM_01']
        trimmed_a = sites_by_isoform['NM_02']
        assert full_a.residue == trimmed_a.residue == 'X'
        assert full_a.position == 10
        assert trimmed_a.position == 4

        trimmed_b = sites_by_isoform['NM_04']
        assert trimmed_b.residue == 'Y'
        assert trimmed_b.position == 4

        # will the mapping to NM_02 still work if we remove 'gene' column?
        sites.drop(columns=['gene'], inplace=True)
        mapped_sites = mapper.map_sites_by_sequence(sites)
        sites_by_isoform = group_by_isoform(mapped_sites)

        assert len(mapped_sites) == 2
        assert set(sites_by_isoform) == {'NM_01', 'NM_02'}
Beispiel #6
0
def minimal_data():
    """Return two genes and single-sample case/control collections using them."""
    tp53, map2k1 = Gene('TP53'), Gene('MAP2K1')

    case_sample = Sample('1', {tp53: 2, map2k1: 1})
    case = SampleCollection('case', [case_sample])

    control_sample = Sample('1', {tp53: 1, map2k1: 1})
    control = SampleCollection('control', [control_sample])

    return tp53, map2k1, case, control
Beispiel #7
0
def test_init():
    """A SampleCollection keeps its name and holds Sample instances."""
    first = Sample('Tumour_1', {Gene('BAD'): 1.2345, Gene('FUCA2'): 6.5432})
    second = Sample('Tumour_2', {Gene('BAD'): 2.3456, Gene('FUCA2'): 7.6543})

    collection = SampleCollection('Tumour', [first, second])

    assert collection.name == 'Tumour'
    for member in collection.samples:
        assert isinstance(member, Sample)
Beispiel #8
0
    def test_train_model(self):
        """Training on two CDK1 sites yields one model with a usable PWM."""
        phosphorylation = SiteType(name='phosphorylation')

        # non-phosphorylated serine residues are needed to generate negative sites
        background = Protein(refseq='NM_007',
                             sequence='--------SLPA-----------SVIT-------')
        db.session.add(Gene(isoforms=[background],
                            preferred_isoform=background))

        # phosphorylated, with sites
        substrate = Protein(refseq='NM_001',
                            sequence='--------SPAK-----------SPAR-------')
        db.session.add(Gene(isoforms=[substrate], preferred_isoform=substrate))

        cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

        for position in (9, 24):
            db.session.add(
                Site(position=position,
                     types={phosphorylation},
                     residue='S',
                     protein=substrate,
                     kinases={cdk1}))

        db.session.commit()

        with TemporaryDirectory() as temp_dir:
            model = train_model(phosphorylation,
                                sequences_dir=temp_dir,
                                sampling_n=2,
                                threshold=2)

        # the model should have one set of params - for CDK1 kinase
        assert len(model) == 1

        pwm = model.rx2('CDK1').rx2('pwm')

        # and the position-specific weight matrix should be created
        assert pwm

        # the very detailed testing should be performed by rMIMP,
        # but why not test the basics?
        weights_of_central_aa = dict(zip(pwm.rownames, pwm.rx(True, 8)))
        heaviest = max(weights_of_central_aa.values())
        assert weights_of_central_aa['S'] == heaviest
def test_gene_init():
    """Genes with the same name are one object; description and id are kept."""
    from copy import copy

    first = Gene('BAD')
    assert first.name == 'BAD'

    # creating the gene again gives back the very same object
    second = Gene('BAD')
    assert second == first
    assert second is first

    described = Gene('TP53', 'Tumour suppressor p53')
    assert described.name == 'TP53'
    assert described.description == 'Tumour suppressor p53'

    # a shallow copy shares the id with the original
    assert copy(described).id is described.id
    def test_search_proteins(self):
        """Exercise search_proteins: limit, ordering and feature selection."""
        from views.search import search_proteins

        # create 15 genes and proteins
        mock_proteins_and_genes(15)

        # control: do we start with the mocked proteins not others?
        assert not search_proteins('TP53')

        # does respect limit? does symbol search work?
        results = search_proteins('Gene', 10)

        assert results
        assert len(results) == 10

        assert results[0].name.startswith('Gene')

        # are results sorted?
        # The product is parenthesised on purpose: '%' and '*' have the same
        # precedence and associate left to right, so 'NM_%s' % 20 * i would
        # repeat the formatted string i times (an empty refseq for i == 0).
        db.session.add_all([
            Gene(name=name,
                 preferred_isoform=Protein(refseq='NM_%s' % (20 * i)))
            for i, name in enumerate(['TPK', 'TPKK'])
        ])
        results = search_proteins('TPK', 2)
        assert results[0].name == 'TPK'
        assert results[0].best_score < results[1].best_score

        # does include both: refseq and symbol search?
        assert search_proteins('NM_0003', 1)

        # can we change subset of searched features?
        assert not search_proteins('NM_0003', 1, features=['gene_symbol'])
        assert not search_proteins('Gene', 1, features=['refseq'])
Beispiel #11
0
def genes():
    """List all genes on GET; create a gene from the submitted form on POST.

    On POST every uploaded file accepted by ``allowed_file`` is stored in the
    ``plasmids`` sub-folder of ``UPLOAD_FOLDER``; the stored names are recorded
    on the new gene as a string in which each name is followed by a comma.
    The user is then redirected to ``/``.
    """
    if request.method == "GET":
        all_genes = Gene.query.all()
        return render_template('display_genes.html', genes=all_genes)
    elif request.method == "POST":
        description = request.form.get("description")
        dna_sequence = request.form.get("dna_sequence")
        created_by = request.form.get("created_by")
        creation_date = request.form.get("creation_date")
        notes = request.form.get("notes")

        saved_names = []
        for upload in request.files.getlist('files[]'):
            if not (upload and allowed_file(upload.filename)):
                continue
            filename = secure_filename(upload.filename)
            if not filename:
                # secure_filename may sanitise a name down to nothing; saving
                # would then target the upload directory itself, so skip it
                continue
            saved_names.append(filename)
            upload.save(
                os.path.join(app.config['UPLOAD_FOLDER'], "plasmids",
                             filename))
        # stored format is unchanged: every name is followed by a comma
        snapgene_files = "".join(name + "," for name in saved_names)

        new_gene = Gene(description, dna_sequence, created_by, creation_date,
                        notes, snapgene_files)
        db.session.add(new_gene)
        db.session.commit()
        flash("Success!")
        return redirect("/")
def create_test_data():
    """Create the mappings file, a MAPK7 gene and its three test proteins.

    Returns a tuple ``(mappings_filename, gene, proteins)`` where ``proteins``
    maps each refseq id to the Protein created for it.
    """

    mappings_filename = make_named_gz_file(raw_mappings)

    # create proteins from first three data rows
    # (each entry is a pair: refseq id, amino acid sequence)
    protein_data = [
        ('NM_002749',
         'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
         ),
        ('NM_139033',
         'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
         ),
        ('NM_139034',
         'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
         ),
    ]

    # a single gene shared by all of the proteins created below
    gene = Gene(name='MAPK7')

    proteins = {
        refseq_nm: Protein(refseq=refseq_nm, sequence=sequence, gene=gene)
        for refseq_nm, sequence in protein_data
    }

    # we need to have proteins with id in session - hence commit
    db.session.add(gene)
    db.session.commit()

    return mappings_filename, gene, proteins
    def test_remove_unwanted_proteins(self):
        """Proteins with a missing or premature stop codon are cleaned away."""
        sequences = {
            'NM_1': 'MAKS*',  # all right
            'NM_2': 'MKFR',  # lack of stop codon
            'NM_3': 'MK*A',  # premature stop codon
        }

        proteins = create_test_proteins(['NM_1', 'NM_2', 'NM_3'])

        # the gene initially prefers an isoform that is about to be removed
        gene = Gene(isoforms=list(proteins.values()),
                    preferred_isoform=proteins['NM_3'])

        db.session.add_all(proteins.values())
        db.session.add(gene)

        for fake_refseq in proteins:
            proteins[fake_refseq].sequence = sequences[fake_refseq]

        clean_from_wrong_proteins()

        # NM_2 and NM_3 should be removed, NM_1 should still be there
        assert len(proteins) == 1
        assert 'NM_1' in proteins

        for removed_refseq in ('NM_2', 'NM_3'):
            assert removed_refseq not in proteins

        # NM_1 should become a preferred isoform of gene (in place of NM_3)
        assert gene.preferred_refseq == 'NM_1'
    def test_select_preferred_isoform(self):
        """The preferred isoform of a gene is picked from five candidates."""
        # (refseq, sequence, is canonical according to uniprot)
        proteins_data = [
            ('NM_001', 'MA', False),
            ('NM_002', 'MAA', True),
            # we want this one: canonical according to uniprot,
            # then longest, then oldest in refseq
            ('NM_003', 'MAAA', True),
            ('NM_004', 'MAAA', True),
            ('NM_005', 'MAAAA', False)
        ]
        preferred_refseq = 'NM_003'

        gene = Gene(name='Gene X')
        for refseq, seq, is_uniprot_canonical in proteins_data:
            protein = Protein(refseq=refseq, sequence=seq, gene=gene)
            if not is_uniprot_canonical:
                continue
            reviewed_entry = UniprotEntry(isoform=1, reviewed=True)
            protein.external_references = ProteinReferences(
                uniprot_entries=[reviewed_entry])

        db.session.add(gene)

        chosen = select_preferred_isoform(gene)
        assert chosen
        assert chosen.refseq == preferred_refseq
Beispiel #15
0
    def test_edge_cases_mapping(self):
        """Map sites located at the very first and very last residue."""
        isoforms = [
            #                                 123456789
            Protein(refseq='NM_01', sequence='AXAXAYAYA'),
            # C-terminal part was trimmed
            Protein(refseq='NM_02', sequence='AXAXA'),
            # N-terminal part was trimmed
            Protein(refseq='NM_03', sequence='AYAYA'),
        ]
        gene_t = Gene(name='T', isoforms=isoforms)
        db.session.add(gene_t)
        db.session.commit()

        known_proteins = create_key_model_dict(Protein, 'refseq')
        mapper = SiteMapper(known_proteins,
                            lambda s: f'{s.position}{s.residue}')

        # both sites are defined on NM_01, one at each end of the sequence
        edge_sites = {
            'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
            'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
        }
        sites = DataFrame.from_dict(data=edge_sites, orient='index')
        sites.columns = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]

        mapped_sites = mapper.map_sites_by_sequence(sites)

        assert len(mapped_sites) == 4
    def test_prepare_dataset(self):
        """prepare_datasets reports the mutation as present only in MC3."""
        from views.mutation import prepare_datasets

        gene = Gene(name='TP53')
        p = Protein(refseq='NM_000123', sequence='TRAN', gene=gene)
        mutation = Mutation(protein=p, position=2, alt='K')
        details = MC3Mutation(mutation=mutation, count=2)

        db.session.add(mutation)

        datasets, user_datasets = prepare_datasets(mutation)

        # one entry per confirmed source; only MC3 carries mutation details
        expected_datasets = []
        for source in source_manager.confirmed:
            if source is MC3Mutation:
                entry = {
                    'filter': 'Mutation.sources:in:' + MC3Mutation.name,
                    'name': MC3Mutation.display_name,
                    'mutation_present': [details]
                }
            else:
                entry = {
                    'filter': 'Mutation.sources:in:' + source.name,
                    'name': source.display_name,
                    'mutation_present': False
                }
            expected_datasets.append(entry)

        assert datasets == expected_datasets
        assert not user_datasets
Beispiel #17
0
    def _set_genes(self, gene_ids):
        """Create a Gene for every id in ``gene_ids`` and store it in ``self.genes``.

        Only the sequence selected with ``is_max_seq_len=True`` is loaded; the
        ``pep_seq_min`` and ``dna_seq`` arguments of each Gene are passed as
        None. Nothing happens when ``gene_ids`` is None.

        The database connection is closed even if loading one of the
        sequences raises.
        """
        if gene_ids is None:
            return
        conn = Pgsql.Common.connect()
        try:
            for cnt, gnid in enumerate(gene_ids, start=1):
                # get sequence information
                gene_seq_pep_max = GeneSequence(gnid=gnid,
                                                seq_type=self.seq_type,
                                                is_max_seq_len=True,
                                                conn=conn,
                                                k=self.k)

                # set Gene instance for each gnid
                self.genes[gnid] = Gene(gnid=gnid,
                                        pep_seq_min=None,
                                        pep_seq_max=gene_seq_pep_max,
                                        dna_seq=None)

                # for test: progress output after every 100th gene
                if cnt % 100 == 0:
                    print('%s gene has been added.' % gnid)
        finally:
            # release the connection on the error path as well
            conn.close()
Beispiel #18
0
def mock_proteins_and_genes(count):
    """Add ``count`` numbered genes, each with one preferred protein, to the session."""
    from database import db
    from models import Gene, Protein

    for index in range(count):
        gene = Gene(name='Gene_%s' % index,
                    full_name='Full name of gene %s' % index)
        # the only isoform of the gene becomes its preferred one
        gene.preferred_isoform = Protein(refseq='NM_000%s' % index, gene=gene)
        db.session.add(gene)
Beispiel #19
0
    def test_domains(self):
        """Load domains for two proteins and check how overlapping ones are merged."""
        # two proteins, each attached to a gene on a different chromosome
        proteins = [
            Protein(
                refseq='NM_018163',
                sequence=
                'MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*',
                gene=Gene(chrom='15')),
            Protein(
                refseq='NM_004671',
                sequence=
                'MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*',
                gene=Gene(chrom='18'))
        ]

        db.session.add_all(proteins)

        filename = make_named_temp_file(domains_data)

        new_domains = load_domains(filename)

        assert len(new_domains) == 6
        assert len(proteins[0].domains) == 2

        # group the domains of the second protein by InterPro short description
        domains = defaultdict(list)

        for domain in proteins[1].domains:
            domains[domain.interpro.short_description].append(domain)

        def assert_ranges(domain, start, end):
            # helper: the domain must span exactly start..end
            assert domain.start == start and domain.end == end

        # two SAP domains should be merged for representation purposes due to similarity criteria
        # (these two domain annotations overlap so the smaller one is contained in the bigger)
        sap_domain = domains['SAP_dom'][0]
        assert_ranges(sap_domain, 1, 65)

        intepro_domain = sap_domain.interpro
        assert intepro_domain.accession == 'IPR003034'
        assert intepro_domain.description == 'SAP domain'

        # here the two annotations overlap with more than 75% in common
        assert_ranges(domains['PINIT'][0], 141, 299)

        # and here overlap was too small to merge those domains
        assert len(domains['Znf_MIZ']) == 2
    def test_refseq(self):
        """RefseqGeneSearch: negative controls, limits and per-gene matching."""
        from search.gene import RefseqGeneSearch

        # create 10 genes and proteins
        mock_proteins_and_genes(10)

        search = RefseqGeneSearch().search

        # negative control
        for phrase in ('9999', 'NM_00000', 'Gene'):
            assert not search(phrase)

        # limiting
        limited = search('NM_', limit=5)
        assert len(limited) == 5

        assert limited[0].name.startswith('Gene')

        # test the search itself
        for refseq in ('NM_0003', 'nm_0003', '0003'):
            results = search(refseq)
            assert len(results) == 1

            found_gene = results[0]
            assert found_gene.name == 'Gene_3'

            isoforms = found_gene.matched_isoforms
            assert len(isoforms) == 1
            assert isoforms.pop().refseq == 'NM_0003'

        gene_x = Gene(name='Gene X',
                      isoforms=[
                          Protein(refseq='NM_000301'),
                          Protein(refseq='NM_000302'),
                      ])
        gene_y = Gene(name='Gene Y', isoforms=[Protein(refseq='NM_000309')])
        db.session.add_all([gene_x, gene_y])

        # so there are three genes with isoforms starting with NM_0003
        # (those are Gene_3, Gene X, Gene Y). Let's see if limiting works
        # well when applied per-gene.
        expected_counts = {
            'NM_0003': 2,
            'NM_00030': 2,
            'NM_000301': 1,
            'NM_000302': 1
        }

        for query in expected_counts:
            assert len(search(query, limit=2)) == expected_counts[query]
Beispiel #21
0
def get_gene_chrom(conn, gene_id):
    """Return the chromosome which the gene is on.

    None is returned when the query yields no (truthy) row for ``gene_id``.
    """
    rows = conn.execute(select([gene]).where(gene.c.id == gene_id))
    first_row = next(rows, None)
    return Gene(*first_row).chrom if first_row else None
Beispiel #22
0
    def test_show(self):
        """The gene page answers with 200 and mentions the gene and its refseq."""
        db.session.add(Gene(**test_gene_data))

        response = self.client.get('/gene/show/BRCA1')

        assert response.status_code == 200
        for expected in (b'BRCA1', b'NM_000123'):
            assert expected in response.data
Beispiel #23
0
def gene_from_isoforms(all_proteins, chosen_isoforms):
    """Just for testing: build a Gene from the chosen isoforms and link both ways.

    Proteins of ``all_proteins`` whose refseq key is in ``chosen_isoforms``
    become the isoforms of a new Gene, and each of them gets that gene
    assigned as its ``gene``.
    """
    isoforms = []
    for refseq, protein in all_proteins.items():
        if refseq in chosen_isoforms:
            isoforms.append(protein)

    gene = Gene(isoforms=isoforms)
    for member in isoforms:
        member.gene = gene
    return gene
Beispiel #24
0
    def test_gene_full_name(self):
        """Loading full gene names fills in the full name of TP53."""
        tp53 = Gene(name='TP53', entrez_id=7157)
        db.session.add(tp53)

        # the importer is fed with a gzip-compressed temporary file
        names_file = make_named_temp_file(full_gene_names,
                                          mode='wt',
                                          opener=gzip.open)
        load_full_gene_names(names_file)

        assert tp53.full_name == 'tumor protein p53'
Beispiel #25
0
def get_generator(latent_dim, input_dim, ngf):
    """Create a generator, apply ``weights_init`` to it and move it to ``device``."""
    generator = Gene(latent_dim, output_dim=input_dim, ngf=ngf)
    generator.apply(weights_init)
    return generator.to(device)
Beispiel #26
0
def create_test_kinase(name, refseq):
    """Return a Kinase called ``name`` whose protein has the given refseq.

    The protein belongs to a new gene named 'Gene of <kinase name>'.
    """
    interactor = Kinase(name=name)
    interactor.protein = Protein(
        refseq=refseq,
        gene=Gene(name='Gene of ' + interactor.name),
    )
    return interactor
    def test_conservation(self):
        """Load conservation scores for two proteins, only one of which has data."""

        proteins = create_test_proteins(['NM_002749', 'NM_000600'])
        proteins['NM_002749'].gene = Gene(name='MAPK7', chrom='17')
        proteins['NM_000600'].gene = Gene(name='IL6', chrom='7')

        for refseq, protein in proteins.items():
            protein.sequence = sequences_for_proteins[refseq]

        # Generated with:
        """
            import pyBigWig
            test_data = test_data.loc[('chr17', 'NM_002749')].iloc[0]
            full_bw = pyBigWig.open('data/hg19.100way.phyloP100way.bw')
            test_bw = pyBigWig.open('chr17_NM_002749.test_data.bw', 'w')
            needed_chrom_length = max(max(test_data.exonStarts), max(test_data.exonEnds))
            test_bw.addHeader([('chr17', needed_chrom_length)])
            coordinates = sorted(zip(test_data.exonStarts, test_data.exonEnds), key=lambda x: x[0])
            for start, end in zip(test_data.exonStarts, test_data.exonEnds):
                values = full_bw.values('chr17', start, end)
                for i, value in enumerate(values):
                    test_bw.addEntries(['chr17'], starts=[start + i], ends=[start + i + 1], values=[value])
            test_bw.close()
        """
        conservation_big_wig = 'tests/test_imports/chr17_NM_002749.test_data.bw'

        gene_coordinates = make_named_temp_file(coordinates_data)
        load_conservation(conservation_big_wig, gene_coordinates)

        # Protein.query.filter_by(refseq='NM_002749').one().conservation
        # (the expected value is a ';'-separated list of scores)
        assert proteins[
            'NM_002749'].conservation == '3.47;1.67;2.95;1.23;.9;1.64;4.08;1.72;1.25;1.15;2.26;1.69;1.03;1.05;2.39;1.52;2.88;.5;2.28;-.09;2.24;1.32;-.06;.59;1.2;-.13;.37;.76;.04;1.26;-.2;1.84;.97;2.95;5.67;3.98;2.91;4.5;2.33;3.17;4.66;3.73;4.24;4.01;2.49;4.35;5.29;2.95;4.67;5.66;5.36;5.25;4.23;4.87;5.31;5.18;3.52;4.53;5.59;3.55;5.05;6.65;2.01;5.75;5.06;4.88;5.9;4.9;5.63;3.32;3.52;5.09;4.04;4.24;2.24;1.84;2.13;6.12;6.54;2.75;6.08;5.79;5.7;8.26;6.81;5.82;3.83;4.47;3.08;5.11;5;3.4;2.52;4.67;3.97;3.58;4.87;3.72;5.33;4.48;3.92;6.79;3.7;7.52;5.46;4.05;3.07;4.02;5.47;5.31;5.12;7.11;6.59;6.09;5.37;4.73;5.6;6.3;6.52;5.96;3.32;2.84;4.06;.53;3.5;3.79;3.11;1.81;4.51;4.81;3.35;4.29;5.34;5.31;4.63;6.41;2.39;6.46;3.16;8.2;6.23;2.67;6.63;2.39;4.07;5;3.49;4.71;4.16;3.27;.7;6.43;1.1;3.25;2.31;1.57;6.39;5.62;2.68;4.33;6.16;6;3.28;6.16;6.66;3.66;3.77;4.11;6.52;3.69;6.55;5.97;5.28;5.9;3.63;6.16;5.36;5.92;4.24;6.23;4.27;7.07;5.64;6.62;5.32;1.79;6.59;3.26;4.38;3.92;6.2;3.59;5.22;5.76;6.77;4.58;7.52;4.24;3.91;5.75;5.28;6.51;7.43;5.49;2.98;5.48;2.01;2.62;2.19;1.95;1.01;2.03;5.55;1.47;2.2;2.38;5.52;5.93;3.39;6.62;5.51;6.67;5.67;3.9;6.03;8.99;6.67;4.17;4.7;4.01;6.27;3.7;5.06;3.66;2.06;2.64;3.76;2.56;5.94;3.82;2.89;4.05;5.17;6.75;3.05;8.99;4.25;5.57;6.66;6.22;6.06;3.65;4.13;6.66;8.76;2.77;2.34;1.65;1.84;3.91;3.38;5.3;3.49;6.22;3.21;3.91;5.43;2.8;4.28;6.44;3.25;3.89;2.56;5.75;4.58;1.02;5.09;3.53;5.75;5;3.97;3.24;2.08;.17;3.51;2.96;1.66;2.99;3.71;4.75;3.78;5.8;3.71;5.3;4.16;3.18;4.66;3.56;6.62;4.69;2.96;4.27;3.1;3.49;2.38;2.54;3.18;4.54;5.17;3.34;.46;4.2;3.77;2.53;1.27;3.6;4.46;.83;1.58;4.57;2.92;2.17;2.81;3.17;1.48;1.13;8.45;2.85;1.15;4.09;5.67;3.85;.82;1.15;3.3;3.95;.21;4.67;1.19;2.83;5.43;3.54;.76;5.35;3.81;3.85;3.23;2.46;5.65;5.76;3.97;6.32;3.35;2.86;5.95;6.63;4.5;2.89;5.45;3.95;4.59;4.34;5.09;4.99;5.24;4.02;4.91;5.83;1.59;2.45;1.71;2.51;2.87;2.93;6;3.07;4.58;5.09;5.26;4.44;4.06;3.28;2.1;6.69;4.55;2.58;5.57;4.79;3.75;.58;1.99;2.45;4.16;3.64;1.55;3.8;2.09;3.71;4.36;2.27;4.31;4.26;3.43;1.01;.4
4;1.57;1.7;2.68;1.46;1.5;1.7;.7;.92;1.94;1.42;2.39;.76;3.24;3.14;1.01;2.85;1.17;1.48;1.22;.67;.8;.75;3.02;1.12;1.34;4.82;3.73;1.26;.92;-.86;1.13;.65;1.18;.06;-.67;.8;.2;.41;.35;.05;.69;1.11;1.09;.06;1.29;2.34;.9;-.06;.36;1.24;-.18;.06;-.14;.87;.86;2.37;1.38;.52;1.97;.97;4.2;1.48;4.38;5.02;3.81;4.14;3.29;4.98;4.21;4.73;5.01;2.84;4.03;4.01;4.89;4.64;2.37;1.66;2.52;5.89;3.6;1.53;1.1;2.19;1.25;1;2.33;2.53;1.89;2.01;1;.84;-.21;.93;2.4;2.06;1.91;4.48;.3;3.03;2.21;2.16;4.76;4.08;4.42;4.07;5.72;3.13;3.52;2.69;5.1;3.98;3.99;5.24;5.81;2.02;1.75;3.18;2.96;4.36;4.26;3.2;3.71;1.85;4.88;2.93;4.94;4.22;2.17;1.67;2.44;4.68;1.22;3.37;1.68;1.73;2.69;1.65;.75;-.94;1.27;.11;-.79;3;1.26;.57;1.71;.45;2.93;1.48;1.67;2.62;3.54;.95;3.03;2.33;2.5;3.92;2.41;3.3;2.08;2.16;.87;1.26;3.18;3.02;4.04;1.48;.78;4.1;2.06;.59;1.89;.25;.44;-.23;-.57;-.45;.33;-.12;.22;-1.71;.1;.63;.58;.04;-.21;.5;-2.18;.06;.06;-.46;-.39;-.23;.26;.36;.42;.86;1.85;.56;.27;-.51;.65;.42;-.35;.43;.93;.05;-.66;.28;.09;.41;-.05;.46;.51;.02;.67;.11;-.55;.27;.09;.77;-.18;.11;2.34;.99;.1;.78;.52;1.23;.3;.04;.31;-.42;.13;.78;1.01;-.48;.98;.14;.65;.25;.97;-.67;.8;.49;1.15;.15;-1.05;.52;1.43;.04;.68;-.14;-.15;.24;.73;1.71;.89;-.05;.68;.53;.1;.52;.81;1.56;.28;.07;2.18;.55;1.24;2.33;.78;-.04;.77;-.09;.4;.61;.96;1.59;1.57;1.71;1.52;1.41;1.11;1.48;.52;1.61;.76;2.41;.05;.73;-.38;.4;1.78;.95;.46;2.29;1.05;1.94;1.74;.88;1.32;1.97;1.66;1.64;1.17;2.12;2.38;2.63;2.64;1.63;2.49;3.23;1.49;4.21;5.62;5.72;5.3;.78;3.01;4.23;3.16;3.53;5.02;3.73;4.38;4.53;4.4;6.09;6.37;5.1;5.65;4.64;5.8;4.94;5.24;4.63;6.08;4.74;4.27;4.1;5.34;5.96;3.64;2.03;3.29;5.31;2.6;2.34;2.25;4.26;2.1;2.08;2.48;.6;1.53;.31;1.96;4.53;3.8;3.27;.67;5.5;2.33;4.89;2.31;3.4;4.37;3.71;4.24;2.89;3.27;2.97;5.14;7.61;2.14;5.73;3.45;4.27;3.13;5.77;4.91;2.21;2.5;4.74;3.57;4.6;3.43;3.41;6.01;3.21;5.22;3.11;6.33;4.8;4.29;5.23;3.28;6.47;2.48;6.01;3.17;5.92;1.38;1.79;2.96;2.67;2.21;2.67;1.89'

        # no data for this one, let's see if the pipeline handles such cases well
        assert proteins['NM_000600'].conservation is None

        db.session.add_all(proteins.values())
        db.session.commit()

        # once re-read through a query, the missing conservation is
        # expected to be an empty string
        assert Protein.query.filter_by(
            refseq='NM_000600').one().conservation == ''
Beispiel #28
0
def create_test_models():
    """Build a small set of linked model objects and return them by name.

    The return value is ``locals()``, so the dictionary keys are exactly the
    local variable names used below (``protein``, ``mutation``,
    ``protein_kinase``, ``kinase``, ``site``); renaming a local changes the
    returned mapping.
    """
    protein = Protein(refseq='NM_0001',
                      gene=Gene(name='SOMEGENE'),
                      sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')

    # mutation details are created without binding them to a local name,
    # so they are not part of the returned dictionary
    MC3Mutation(mutation=mutation,
                cancer=Cancer(code='CAN'),
                samples='Some sample')
    InheritedMutation(
        mutation=mutation,
        clin_data=[ClinicalData(disease=Disease(name='Some disease'))])

    # a second protein acting as the kinase of the site defined on `protein`
    protein_kinase = Protein(refseq='NM_0002',
                             gene=Gene(name='OTHERGENE'),
                             sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    return locals()
Beispiel #29
0
def mutate(gene: Gene) -> Gene:
    """Return a new Gene whose items differ from ``gene`` by one random change.

    The change is chosen uniformly among: replacing the item at any single
    index, appending a new item, and (only when there are at least two items)
    removing a random item. New items are drawn from ``conf.GENE_ITEMS_POOL``.
    """
    items = list(gene.value)

    # every index stands for "replace the item at that position"
    actions = list(range(len(items)))
    actions.append('add')
    if len(items) >= 2:
        actions.append('remove')

    action = random.choice(actions)
    if action == 'add':
        items.append(random.choice(conf.GENE_ITEMS_POOL))
        return Gene(tuple(items))
    if action == 'remove':
        items.pop(random.randint(0, len(items) - 1))
        return Gene(tuple(items))

    items[action] = random.choice(conf.GENE_ITEMS_POOL)
    return Gene(tuple(items))
Beispiel #30
0
    def test_show(self):
        """The mutation page answers with 200 and mentions gene and refseq."""
        tp53 = Gene(name='TP53')
        protein = Protein(refseq='NM_000123', sequence='TRAN', gene=tp53)
        db.session.add(Mutation(protein=protein, position=2, alt='K'))

        response = self.client.get('/mutation/show/NM_000123/2/K')

        assert response.status_code == 200
        for expected in (b'TP53', b'NM_000123'):
            assert expected in response.data