def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None):
    """Recursively compute the perturbation factor (PF) of a pathway node.

    The node's own contribution is the fold change of the first of its
    comma-separated names found in ``self.experiment_genes`` (``MAX_IF``
    when that fold change is NaN, 0 when no name matches). Every upstream
    node that is not in ``visited`` then adds its own PF, scaled by the mean
    interaction weight of the connecting edge and divided by the number of
    out-edges of that upstream node.

    Args:
        experiment: passed through unchanged to the recursive calls.
        gene: node identifier; may hold several comma-separated gene names.
        pathway: graph exposing ``in_edges``/``out_edges`` and accepted by
            ``get_edge_attributes`` (presumably a networkx directed graph -
            TODO confirm).
        visited: nodes already handled on the current recursion path; they
            are skipped so that cycles do not recurse forever.

    Returns:
        The accumulated perturbation factor.
    """
    visited = visited if visited else []
    pf = 0
    names = gene.split(',')
    # NOTE(review): this guard compares unstripped names while the loop below
    # strips them, and Gene() receives the unstripped name - confirm intended.
    if set(names) & set(self.experiment_genes):
        for name in names:
            if name.strip() in self.experiment_genes:
                # get ΔE
                fold_change = self.FC['FC'][Gene(name)]
                pf = MAX_IF if isnan(fold_change) else fold_change
                break

    # genes directly upstream
    upstream_edges = [
        edge for edge in pathway.in_edges(gene) if edge[0] not in visited
    ]
    if upstream_edges:
        # evaluated once here instead of once per upstream edge
        edge_types = get_edge_attributes(pathway, 'type')
        for edge in upstream_edges:
            # unknown interaction types weigh 0
            beta = mean([interaction_weights.get(t, 0) for t in edge_types[edge]])
            # genes directly downstream
            dstream = len(pathway.out_edges(edge[0]))
            pf += self.calculate_perturbation_factor(
                experiment, edge[0], pathway, visited + [edge[1]]
            ) * beta / dstream
    return pf
def test_file_with_description(test_files, tmpdir):
    """Check parsing of a control file that carries a description column.

    Without the ``-d`` flag a warning is expected and the description column
    is counted as a sample; with the flag the descriptions end up on genes.
    """
    # NOTE(review): the file is named *.tsv; verify that the column
    # separators in these rows are the ones the parser expects.
    control_rows = (
        'Gene Description Control_1 Control_2',
        'TP53 Tumour protein 53 6 6',
        'BRCA2 Breast cancer type 2 s. protein 6 7',
    )
    create_files(tmpdir, {'control_with_descriptions.tsv': control_rows})

    expected_warning = (
        'First line of your file contains "description" column '
        'but you did not provide "--description_column" argument.')
    command = 'case t.tsv control control_with_descriptions.tsv'

    # user forgot
    with pytest.warns(UserWarning, match=expected_warning):
        opts = p_parse(command)
    assert len(opts.control.sample_collection.samples) == 3

    # user remembered
    opts = p_parse(command + ' -d')
    samples = opts.control.sample_collection.samples
    assert len(samples) == 2

    expected_genes = {
        Gene('TP53', description='Tumour protein 53'),
        Gene('BRCA2', description='Breast cancer type 2 s. protein'),
    }
    assert set(samples[0].genes) == expected_genes
def create_test_models():
    """Build a small set of linked model objects for tests.

    Creates a protein with one mutation (annotated by an MC3Mutation and an
    InheritedMutation with two clinical data entries) and one site linked to
    a kinase that has its own protein.

    Returns:
        dict: the function's local variables as returned by ``locals()``,
        i.e. the keys ``protein``, ``mutation``, ``protein_kinase``,
        ``kinase`` and ``site``. Local names are therefore part of the
        result and must not be renamed.
    """
    protein = Protein(refseq='NM_0001', gene=Gene(name='SOMEGENE'), sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the mutation details are not bound to names, so they are not part of
    # the returned dict; they are only linked through the `mutation` argument
    MC3Mutation(mutation=mutation, cancer=Cancer(code='CAN'), samples='Sample A,Sample B', count=2)
    InheritedMutation(mutation=mutation, clin_data=[
        ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
        ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
    ])

    protein_kinase = Protein(refseq='NM_0002', gene=Gene(name='OTHERGENE'), sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein, position=1, residue='A', kinases={kinase}, pmid={1, 2}, types={SiteType(name='glycosylation')})
    protein.sites = [site]

    return locals()
def test_sample_init():
    """A Sample keeps its name and its Gene-keyed data mapping."""
    expression = {Gene('BAD'): 1.2345, Gene('FUCA2'): 6.5432}
    tumour = Sample('Tumour_1', expression)

    assert tumour.name == 'Tumour_1'
    # every key of the stored data has to be a Gene instance
    for key in tumour.data.keys():
        assert isinstance(key, Gene)
    assert tumour.data == expression
def test_mapping(self):
    """Sites are mapped by sequence onto the isoforms present in the database."""
    isoforms_a = [
        # the full isoform of gene A
        Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
        # a trimmed isoform of gene A
        Protein(refseq='NM_02', sequence='AAAXAA'),
    ]
    gene_a = Gene(name='A', isoforms=isoforms_a)

    isoforms_b = [
        Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
        Protein(refseq='NM_04', sequence='BBBYBB'),
    ]
    gene_b = Gene(name='B', isoforms=isoforms_b)

    db.session.add_all([gene_a, gene_b])

    # whoops, NM_03 has been accidentally removed (!)
    lost_isoform = Protein.query.filter_by(refseq='NM_03').one()
    db.session.delete(lost_isoform)
    db.session.commit()

    mapper = SiteMapper(
        create_key_model_dict(Protein, 'refseq'),
        lambda s: f'{s.position}{s.residue}'
    )

    site_rows = {
        'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
        'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
    }
    sites = DataFrame.from_dict(data=site_rows, orient='index')
    sites.columns = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]

    mapped_sites = mapper.map_sites_by_sequence(sites)
    sites_by_isoform = group_by_isoform(mapped_sites)

    # one from NM_01 (defined), from NM_02 (mapped), from NM_04 (mapped)
    assert len(mapped_sites) == 3
    assert set(sites_by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

    full_site = sites_by_isoform['NM_01']
    trimmed_site = sites_by_isoform['NM_02']
    assert full_site.residue == trimmed_site.residue
    assert trimmed_site.residue == 'X'
    assert full_site.position == 10
    assert trimmed_site.position == 4

    rescued_site = sites_by_isoform['NM_04']
    assert rescued_site.residue == 'Y'
    assert rescued_site.position == 4

    # will the mapping to NM_02 still work if we remove 'gene' column?
    sites.drop(columns=['gene'], inplace=True)
    mapped_sites = mapper.map_sites_by_sequence(sites)
    sites_by_isoform = group_by_isoform(mapped_sites)

    assert len(mapped_sites) == 2
    assert set(sites_by_isoform) == {'NM_01', 'NM_02'}
def minimal_data():
    """Return two genes and one-sample case/control collections built on them.

    Returns:
        tuple: ``(tp53, map2k1, case, control)``.
    """
    tp53, map2k1 = Gene('TP53'), Gene('MAP2K1')

    # TP53 differs between case and control, MAP2K1 does not
    case_samples = [Sample('1', {tp53: 2, map2k1: 1})]
    case = SampleCollection('case', case_samples)

    control_samples = [Sample('1', {tp53: 1, map2k1: 1})]
    control = SampleCollection('control', control_samples)

    return tp53, map2k1, case, control
def test_init():
    """A SampleCollection keeps its name and holds Sample instances."""
    first = Sample('Tumour_1', {Gene('BAD'): 1.2345, Gene('FUCA2'): 6.5432})
    second = Sample('Tumour_2', {Gene('BAD'): 2.3456, Gene('FUCA2'): 7.6543})

    collection = SampleCollection('Tumour', [first, second])

    assert collection.name == 'Tumour'
    for member in collection.samples:
        assert isinstance(member, Sample)
def test_train_model(self):
    """Train a model for phosphorylation sites and inspect the CDK1 matrix."""
    phosphorylation = SiteType(name='phosphorylation')

    # non-phosphorylated serine residues are needed to generate negative sites
    background = Protein(refseq='NM_007', sequence='--------SLPA-----------SVIT-------')
    db.session.add(Gene(isoforms=[background], preferred_isoform=background))

    # phosphorylated, with sites
    substrate = Protein(refseq='NM_001', sequence='--------SPAK-----------SPAR-------')
    db.session.add(Gene(isoforms=[substrate], preferred_isoform=substrate))

    cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

    for position in [9, 24]:
        db.session.add(
            Site(
                position=position, types={phosphorylation}, residue='S',
                protein=substrate, kinases={cdk1}
            )
        )

    db.session.commit()

    with TemporaryDirectory() as temp_dir:
        model = train_model(
            phosphorylation, sequences_dir=temp_dir,
            sampling_n=2, threshold=2
        )

        # the model should have one set of params - for CDK1 kinase
        assert len(model) == 1

        pwm = model.rx2('CDK1').rx2('pwm')

        # and the position-specific weight matrix should be created
        assert pwm

        # the very detailed testing should be performed by rMIMP,
        # but why not test the basics?
        weights_of_central_aa = dict(zip(pwm.rownames, pwm.rx(True, 8)))
        strongest = max(weights_of_central_aa.values())
        assert weights_of_central_aa['S'] == strongest
def test_gene_init():
    """Genes with the same name are one and the same object; copies share the id."""
    from copy import copy

    original = Gene('BAD')
    assert original.name == 'BAD'

    # creating a gene with the same name again has to yield the very same object
    duplicate = Gene('BAD')
    assert duplicate == original
    assert duplicate is original

    described = Gene('TP53', 'Tumour suppressor p53')
    assert described.name == 'TP53'
    assert described.description == 'Tumour suppressor p53'

    clone = copy(described)
    assert clone.id is described.id
def test_search_proteins(self):
    """Exercise `search_proteins`: limits, symbol/refseq search, ordering and feature subsets."""
    from views.search import search_proteins

    # create 15 genes and proteins
    mock_proteins_and_genes(15)

    # control: do we start with the mocked proteins not others?
    assert not search_proteins('TP53')

    # does respect limit? does symbol search work?
    results = search_proteins('Gene', 10)

    assert results
    assert len(results) == 10

    assert results[0].name.startswith('Gene')

    # are results sorted?
    # The multiplication is parenthesized on purpose: `%` and `*` share
    # precedence and associate left-to-right, so `'NM_%s' % 20 * i` repeated
    # the already formatted string `i` times (an empty refseq for i == 0).
    db.session.add_all([
        Gene(name=name, preferred_isoform=Protein(refseq='NM_%s' % (20 * i)))
        for i, name in enumerate(['TPK', 'TPKK'])
    ])
    results = search_proteins('TPK', 2)

    assert results[0].name == 'TPK'
    assert results[0].best_score < results[1].best_score

    # does include both: refseq and symbol search?
    assert search_proteins('NM_0003', 1)

    # can we change subset of searched features?
    assert not search_proteins('NM_0003', 1, features=['gene_symbol'])
    assert not search_proteins('Gene', 1, features=['refseq'])
def genes():
    """List all genes (GET) or create a gene from the submitted form (POST).

    GET renders ``display_genes.html`` with every Gene from the database.
    POST reads the gene fields from the form, saves each uploaded file that
    passes ``allowed_file`` into the ``plasmids`` sub-folder of
    ``UPLOAD_FOLDER``, stores the new Gene, flashes a message and redirects
    to ``/``. For any other request method nothing is returned.
    """
    if request.method == "GET":
        return render_template('display_genes.html', genes=Gene.query.all())

    if request.method == "POST":
        form = request.form
        description = form.get("description")
        dna_sequence = form.get("dna_sequence")
        created_by = form.get("created_by")
        creation_date = form.get("creation_date")
        notes = form.get("notes")

        plasmid_dir = os.path.join(app.config['UPLOAD_FOLDER'], "plasmids")
        saved_names = []
        for upload in request.files.getlist('files[]'):
            if upload and allowed_file(upload.filename):
                # sanitize the client-supplied name before using it in a path
                filename = secure_filename(upload.filename)
                upload.save(os.path.join(plasmid_dir, filename))
                saved_names.append(filename)

        # stored format: each file name is followed by a comma,
        # including the last one
        snapgene_files = "".join(name + "," for name in saved_names)

        new_gene = Gene(description, dna_sequence, created_by,
                        creation_date, notes, snapgene_files)
        db.session.add(new_gene)
        db.session.commit()

        flash("Success!")
        return redirect("/")
def create_test_data():
    """Prepare a mappings file plus the MAPK7 gene with three isoforms.

    Returns:
        tuple: ``(mappings_filename, gene, proteins)`` where ``proteins``
        maps each refseq id to its Protein.
    """
    mappings_filename = make_named_gz_file(raw_mappings)

    # create proteins from first three data rows
    sequences_by_refseq = {
        'NM_002749': 'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*',
        'NM_139033': 'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*',
        'NM_139034': 'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*',
    }

    gene = Gene(name='MAPK7')

    proteins = {}
    for refseq_nm, sequence in sequences_by_refseq.items():
        proteins[refseq_nm] = Protein(refseq=refseq_nm, sequence=sequence, gene=gene)

    # we need to have proteins with id in session - hence commit
    db.session.add(gene)
    db.session.commit()

    return mappings_filename, gene, proteins
def test_remove_unwanted_proteins(self):
    """Proteins with a missing or premature stop codon should be cleaned up."""
    sequences = {
        'NM_1': 'MAKS*',  # all right
        'NM_2': 'MKFR',   # lack of stop codon
        'NM_3': 'MK*A',   # premature stop codon
    }

    proteins = create_test_proteins(['NM_1', 'NM_2', 'NM_3'])

    gene = Gene(
        isoforms=list(proteins.values()),
        preferred_isoform=proteins['NM_3']
    )
    db.session.add_all(proteins.values())
    db.session.add(gene)

    for fake_refseq, protein in proteins.items():
        protein.sequence = sequences[fake_refseq]

    clean_from_wrong_proteins()

    # NM_2 and NM_3 should be removed, NM_1 should still be there
    assert len(proteins) == 1
    assert 'NM_1' in proteins
    assert 'NM_2' not in proteins
    assert 'NM_3' not in proteins

    # NM_1 should become a preferred isoform of gene (in place of NM_3)
    assert gene.preferred_refseq == 'NM_1'
def test_select_preferred_isoform(self):
    """The preferred isoform is the UniProt-canonical, longest, oldest one."""
    # (refseq, sequence, is canonical according to uniprot)
    proteins_data = [
        ('NM_001', 'MA', False),
        ('NM_002', 'MAA', True),
        # we want this one:
        # canonical according to uniprot, then longest, then oldest in refseq
        ('NM_003', 'MAAA', True),
        ('NM_004', 'MAAA', True),
        ('NM_005', 'MAAAA', False)
    ]
    preferred_refseq = 'NM_003'

    gene = Gene(name='Gene X')
    for refseq, seq, is_uniprot_canonical in proteins_data:
        protein = Protein(refseq=refseq, sequence=seq, gene=gene)
        if not is_uniprot_canonical:
            continue
        protein.external_references = ProteinReferences(
            uniprot_entries=[UniprotEntry(isoform=1, reviewed=True)]
        )

    db.session.add(gene)

    chosen = select_preferred_isoform(gene)
    assert chosen
    assert chosen.refseq == preferred_refseq
def test_edge_cases_mapping(self):
    """Sites at the very ends of a sequence are mapped onto trimmed isoforms."""
    isoforms = [
        #                               123456789
        Protein(refseq='NM_01', sequence='AXAXAYAYA'),
        # C-terminal part was trimmed
        Protein(refseq='NM_02', sequence='AXAXA'),
        # N-terminal part was trimmed
        Protein(refseq='NM_03', sequence='AYAYA'),
    ]
    gene_t = Gene(name='T', isoforms=isoforms)
    db.session.add(gene_t)
    db.session.commit()

    mapper = SiteMapper(
        create_key_model_dict(Protein, 'refseq'),
        lambda s: f'{s.position}{s.residue}'
    )

    # all sites are defined on NM_01: one at each end of the sequence
    edge_sites = {
        'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
        'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
    }
    sites = DataFrame.from_dict(data=edge_sites, orient='index')
    sites.columns = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]

    mapped_sites = mapper.map_sites_by_sequence(sites)

    assert len(mapped_sites) == 4
def test_prepare_dataset(self):
    """`prepare_datasets` lists every confirmed source and marks MC3 as present."""
    from views.mutation import prepare_datasets

    p = Protein(refseq='NM_000123', sequence='TRAN', gene=Gene(name='TP53'))
    mutation = Mutation(protein=p, position=2, alt='K')
    details = MC3Mutation(mutation=mutation, count=2)

    db.session.add(mutation)

    datasets, user_datasets = prepare_datasets(mutation)

    expected_datasets = []
    for source in source_manager.confirmed:
        # only the MC3 source has details for this mutation
        presence = [details] if source is MC3Mutation else False
        expected_datasets.append({
            'filter': 'Mutation.sources:in:' + source.name,
            'name': source.display_name,
            'mutation_present': presence
        })

    assert datasets == expected_datasets
    assert not user_datasets
def _set_genes(self, gene_ids): if gene_ids is None: return conn = Pgsql.Common.connect() cnt = 0 for gnid in gene_ids: # get sequence information #gene_seq_pep_min = GeneSequence(gnid=gnid, seq_type=self.seq_type, is_max_seq_len=False, conn=conn) gene_seq_pep_min = None gene_seq_pep_max = GeneSequence(gnid=gnid, seq_type=self.seq_type, is_max_seq_len=True, conn=conn, k=self.k) gene_seq_dna_max = None # set Gene instance for each gnid gene = Gene(gnid=gnid, pep_seq_min=gene_seq_pep_min, pep_seq_max=gene_seq_pep_max, dna_seq=gene_seq_dna_max) self.genes[gnid] = gene cnt += 1 # for test if cnt % 100 == 0: print('%s gene has been added.' % gnid) conn.close() # for single adding
def mock_proteins_and_genes(count):
    """Add `count` numbered genes, each with one preferred isoform, to the session.

    Genes are named ``Gene_<i>`` and their proteins ``NM_000<i>`` for
    ``i`` in ``range(count)``. The session is not committed here.
    """
    from database import db
    from models import Gene, Protein

    for index in range(count):
        gene = Gene(
            name='Gene_%s' % index,
            full_name='Full name of gene %s' % index
        )
        gene.preferred_isoform = Protein(refseq='NM_000%s' % index, gene=gene)
        db.session.add(gene)
def test_domains(self):
    """Load domains from a file and check how overlapping annotations are merged."""
    first_protein = Protein(
        refseq='NM_018163',
        sequence='MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*',
        gene=Gene(chrom='15')
    )
    second_protein = Protein(
        refseq='NM_004671',
        sequence='MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*',
        gene=Gene(chrom='18')
    )
    proteins = [first_protein, second_protein]

    db.session.add_all(proteins)

    filename = make_named_temp_file(domains_data)

    new_domains = load_domains(filename)

    assert len(new_domains) == 6
    assert len(first_protein.domains) == 2

    # group the domains of the second protein by InterPro short description
    domains = defaultdict(list)
    for domain in second_protein.domains:
        domains[domain.interpro.short_description].append(domain)

    # two SAP domains should be merged for representation purposes due to similarity criteria
    # (these two domain annotation overlap so the smaller one is contained in the bigger)
    sap_domain = domains['SAP_dom'][0]
    assert sap_domain.start == 1 and sap_domain.end == 65

    sap_interpro = sap_domain.interpro
    assert sap_interpro.accession == 'IPR003034'
    assert sap_interpro.description == 'SAP domain'

    # here the two annotations overlap with more than 75% of common
    pinit_domain = domains['PINIT'][0]
    assert pinit_domain.start == 141 and pinit_domain.end == 299

    # and here overlap was too small to merge those domains
    assert len(domains['Znf_MIZ']) == 2
def test_refseq(self):
    """RefseqGeneSearch finds genes by isoform refseq and limits results per gene."""
    from search.gene import RefseqGeneSearch

    # create 10 genes and proteins
    mock_proteins_and_genes(10)

    search = RefseqGeneSearch().search

    # negative control
    assert not search('9999')
    assert not search('NM_00000')
    assert not search('Gene')

    # limiting
    limited = search('NM_', limit=5)
    assert len(limited) == 5
    assert limited[0].name.startswith('Gene')

    # test the search itself
    for refseq in ['NM_0003', 'nm_0003', '0003']:
        results = search(refseq)
        assert len(results) == 1
        matched_gene = results[0]
        assert matched_gene.name == 'Gene_3'

        isoforms = matched_gene.matched_isoforms
        assert len(isoforms) == 1
        assert isoforms.pop().refseq == 'NM_0003'

    gene_x = Gene(name='Gene X', isoforms=[
        Protein(refseq='NM_000301'),
        Protein(refseq='NM_000302'),
    ])
    gene_y = Gene(name='Gene Y', isoforms=[Protein(refseq='NM_000309')])
    db.session.add_all([gene_x, gene_y])

    # so there are three genes with isoforms starting with NM_0003
    # (those are Gene_3, Gene X, Gene Y). Let's see if limiting works
    # well when applied per-gene.
    queries = {'NM_0003': 2, 'NM_00030': 2, 'NM_000301': 1, 'NM_000302': 1}

    for query, expected_result in queries.items():
        found = search(query, limit=2)
        assert len(found) == expected_result
def get_gene_chrom(conn, gene_id):
    """Return the chromosome of the gene with the given id.

    Returns ``None`` when the query yields no (truthy) row.
    """
    lookup = select([gene]).where(gene.c.id == gene_id)
    # only the first row of the result is of interest
    row = next(conn.execute(lookup), None)
    return Gene(*row).chrom if row else None
def test_show(self):
    """The gene page responds with 200 and mentions the gene and its refseq."""
    db.session.add(Gene(**test_gene_data))

    response = self.client.get('/gene/show/BRCA1')

    assert response.status_code == 200
    for fragment in (b'BRCA1', b'NM_000123'):
        assert fragment in response.data
def gene_from_isoforms(all_proteins, chosen_isoforms):
    """Create a gene from the chosen isoforms and link each isoform back to it.

    Just for testing: the gene <-> isoform link is set explicitly in both
    directions here.
    """
    isoforms = []
    for refseq, protein in all_proteins.items():
        if refseq in chosen_isoforms:
            isoforms.append(protein)

    gene = Gene(isoforms=isoforms)

    # set the reverse side of the relationship by hand
    for member in isoforms:
        member.gene = gene

    return gene
def test_gene_full_name(self):
    """Loading full gene names fills in `full_name` of the matching gene."""
    tp53 = Gene(name='TP53', entrez_id=7157)
    db.session.add(tp53)

    # the loader is given a gzip-compressed temporary file
    names_file = make_named_temp_file(
        full_gene_names, mode='wt', opener=gzip.open
    )
    load_full_gene_names(names_file)

    assert tp53.full_name == 'tumor protein p53'
def get_generator(latent_dim, input_dim, ngf):
    """Build the generator, apply `weights_init` to it and move it to `device`.

    Args:
        latent_dim: forwarded as the first positional argument of ``Gene``.
        input_dim: forwarded to ``Gene`` as ``output_dim``.
        ngf: forwarded to ``Gene`` as ``ngf``.

    Returns:
        Whatever ``.to(device)`` returns for the initialised generator.
    """
    # Initialise generator
    generator = Gene(latent_dim, output_dim=input_dim, ngf=ngf)
    generator.apply(weights_init)
    return generator.to(device)
def create_test_kinase(name, refseq):
    """Return a Kinase called `name` whose protein has the given `refseq`.

    The protein belongs to a new gene named ``'Gene of <kinase name>'``.
    """
    interactor = Kinase(name=name)
    interactor.protein = Protein(
        refseq=refseq,
        gene=Gene(name='Gene of ' + interactor.name)
    )
    return interactor
def test_conservation(self):
    """Load conservation scores from a bigWig file and check both proteins."""
    proteins = create_test_proteins(['NM_002749', 'NM_000600'])
    mapk7 = proteins['NM_002749']
    il6 = proteins['NM_000600']

    mapk7.gene = Gene(name='MAPK7', chrom='17')
    il6.gene = Gene(name='IL6', chrom='7')

    for refseq, protein in proteins.items():
        protein.sequence = sequences_for_proteins[refseq]

    # Generated with:
    #
    #   import pyBigWig
    #   test_data = test_data.loc[('chr17', 'NM_002749')].iloc[0]
    #   full_bw = pyBigWig.open('data/hg19.100way.phyloP100way.bw')
    #   test_bw = pyBigWig.open('chr17_NM_002749.test_data.bw', 'w')
    #   needed_chrom_length = max(max(test_data.exonStarts), max(test_data.exonEnds))
    #   test_bw.addHeader([('chr17', needed_chrom_length)])
    #   coordinates = sorted(zip(test_data.exonStarts, test_data.exonEnds), key=lambda x: x[0])
    #   for start, end in zip(test_data.exonStarts, test_data.exonEnds):
    #       values = full_bw.values('chr17', start, end)
    #       for i, value in enumerate(values):
    #           test_bw.addEntries(['chr17'], starts=[start + i], ends=[start + i + 1], values=[value])
    #   test_bw.close()
    conservation_big_wig = 'tests/test_imports/chr17_NM_002749.test_data.bw'

    gene_coordinates = make_named_temp_file(coordinates_data)

    load_conservation(conservation_big_wig, gene_coordinates)

    # Protein.query.filter_by(refseq='NM_002749').one().conservation
    expected_conservation = (
        '3.47;1.67;2.95;1.23;.9;1.64;4.08;1.72;1.25;1.15;2.26;1.69;1.03;1.05;2.39;1.52;2.88;.5;2.28;-.09;2.24;1.32;-.06;.59;1.2;-.13;.37;.76;.04;1.26;-.2;1.84;.97;2.95;5.67;3.98;2.91;4.5;2.33;3.17;4.66;3.73;4.24;4.01;2.49;4.35;5.29;2.95;4.67;5.66;5.36;5.25;4.23;4.87;5.31;5.18;3.52;4.53;5.59;3.55;5.05;6.65;2.01;5.75;5.06;4.88;5.9;4.9;5.63;3.32;3.52;5.09;4.04;4.24;2.24;1.84;2.13;6.12;6.54;2.75;6.08;5.79;5.7;8.26;6.81;5.82;3.83;4.47;3.08;5.11;5;3.4;2.52;4.67;3.97;3.58;4.87;3.72;5.33;4.48;3.92;6.79;3.7;7.52;5.46;4.05;3.07;4.02;5.47;5.31;5.12;7.11;6.59;6.09;5.37;4.73;5.6;6.3;6.52;5.96;3.32;2.84;4.06;.53;3.5;3.79;3.11;1.81;4.51;4.81;3.35;4.29;5.34;5.31;4.63;6.41;2.39;6.46;3.16;8.2;6.23;2.67;6.63;2.39;4.07;5;3.49;4.71;4.16;3.27;.7;6.43;1.1;3.25;2.31;1.57;6.39;5.62;2.68;4.33;6.16;6;3.28;6.16;6.66;3.66;3.77;4.11;6.52;3.69;6.55;5.97;5.28;5.9;3.63;6.16;5.36;5.92;4.24;6.23;4.27;7.07;5.64;6.62;5.32;1.79;6.59;3.26;4.38;3.92;6.2;3.59;5.22;5.76;6.77;4.58;7.52;4.24;3.91;5.75;5.28;6.51;7.43;5.49;2.98;5.48;2.01;2.62;2.19;1.95;1.01;2.03;5.55;1.47;2.2;2.38;5.52;5.93;3.39;6.62;5.51;6.67;5.67;3.9;6.03;8.99;6.67;4.17;4.7;4.01;6.27;3.7;5.06;3.66;2.06;2.64;3.76;2.56;5.94;3.82;2.89;4.05;5.17;6.75;3.05;8.99;4.25;5.57;6.66;6.22;6.06;3.65;4.13;6.66;8.76;2.77;2.34;1.65;1.84;3.91;3.38;5.3;3.49;6.22;3.21;3.91;5.43;2.8;4.28;6.44;3.25;3.89;2.56;5.75;4.58;1.02;5.09;3.53;5.75;5;3.97;3.24;2.08;.17;3.51;2.96;1.66;2.99;3.71;4.75;3.78;5.8;3.71;5.3;4.16;3.18;4.66;3.56;6.62;4.69;2.96;4.27;3.1;3.49;2.38;2.54;3.18;4.54;5.17;3.34;.46;4.2;3.77;2.53;1.27;3.6;4.46;.83;1.58;4.57;2.92;2.17;2.81;3.17;1.48;1.13;8.45;2.85;1.15;4.09;5.67;3.85;.82;1.15;3.3;3.95;.21;4.67;1.19;2.83;5.43;3.54;.76;5.35;3.81;3.85;3.23;2.46;5.65;5.76;3.97;6.32;3.35;2.86;5.95;6.63;4.5;2.89;5.45;3.95;4.59;4.34;5.09;4.99;5.24;4.02;4.91;5.83;1.59;2.45;1.71;2.51;2.87;2.93;6;3.07;4.58;5.09;5.26;4.44;4.06;3.28;2.1;6.69;4.55;2.58;5.57;4.79;3.75;.58;1.99;2.45;4.16;3.64;1.55;3.8;2.09;3.71;4.36;2.27;4.31;4.26;3.43;1.01;.44;1.57;1.7;2.68;1.46;1.5;1.7;.7;.92;1.94;'
        '1.42;2.39;.76;3.24;3.14;1.01;2.85;1.17;1.48;1.22;.67;.8;.75;3.02;1.12;1.34;4.82;3.73;1.26;.92;-.86;1.13;.65;1.18;.06;-.67;.8;.2;.41;.35;.05;.69;1.11;1.09;.06;1.29;2.34;.9;-.06;.36;1.24;-.18;.06;-.14;.87;.86;2.37;1.38;.52;1.97;.97;4.2;1.48;4.38;5.02;3.81;4.14;3.29;4.98;4.21;4.73;5.01;2.84;4.03;4.01;4.89;4.64;2.37;1.66;2.52;5.89;3.6;1.53;1.1;2.19;1.25;1;2.33;2.53;1.89;2.01;1;.84;-.21;.93;2.4;2.06;1.91;4.48;.3;3.03;2.21;2.16;4.76;4.08;4.42;4.07;5.72;3.13;3.52;2.69;5.1;3.98;3.99;5.24;5.81;2.02;1.75;3.18;2.96;4.36;4.26;3.2;3.71;1.85;4.88;2.93;4.94;4.22;2.17;1.67;2.44;4.68;1.22;3.37;1.68;1.73;2.69;1.65;.75;-.94;1.27;.11;-.79;3;1.26;.57;1.71;.45;2.93;1.48;1.67;2.62;3.54;.95;3.03;2.33;2.5;3.92;2.41;3.3;2.08;2.16;.87;1.26;3.18;3.02;4.04;1.48;.78;4.1;2.06;.59;1.89;.25;.44;-.23;-.57;-.45;.33;-.12;.22;-1.71;.1;.63;.58;.04;-.21;.5;-2.18;.06;.06;-.46;-.39;-.23;.26;.36;.42;.86;1.85;.56;.27;-.51;.65;.42;-.35;.43;.93;.05;-.66;.28;.09;.41;-.05;.46;.51;.02;.67;.11;-.55;.27;.09;.77;-.18;.11;2.34;.99;.1;.78;.52;1.23;.3;.04;.31;-.42;.13;.78;1.01;-.48;.98;.14;.65;.25;.97;-.67;.8;.49;1.15;.15;-1.05;.52;1.43;.04;.68;-.14;-.15;.24;.73;1.71;.89;-.05;.68;.53;.1;.52;.81;1.56;.28;.07;2.18;.55;1.24;2.33;.78;-.04;.77;-.09;.4;.61;.96;1.59;1.57;1.71;1.52;1.41;1.11;1.48;.52;1.61;.76;2.41;.05;.73;-.38;.4;1.78;.95;.46;2.29;1.05;1.94;1.74;.88;1.32;1.97;1.66;1.64;1.17;2.12;2.38;2.63;2.64;1.63;2.49;3.23;1.49;4.21;5.62;5.72;5.3;.78;3.01;4.23;3.16;3.53;5.02;3.73;4.38;4.53;4.4;6.09;6.37;5.1;5.65;4.64;5.8;4.94;5.24;4.63;6.08;4.74;4.27;4.1;5.34;5.96;3.64;2.03;3.29;5.31;2.6;2.34;2.25;4.26;2.1;2.08;2.48;.6;1.53;.31;1.96;4.53;3.8;3.27;.67;5.5;2.33;4.89;2.31;3.4;4.37;3.71;4.24;2.89;3.27;2.97;5.14;7.61;2.14;5.73;3.45;4.27;3.13;5.77;4.91;2.21;2.5;4.74;3.57;4.6;3.43;3.41;6.01;3.21;5.22;3.11;6.33;4.8;4.29;5.23;3.28;6.47;2.48;6.01;3.17;5.92;1.38;1.79;2.96;2.67;2.21;2.67;1.89'
    )
    assert mapk7.conservation == expected_conservation

    # no data for this one, lets see if the pipeline handles such cases well
    assert il6.conservation is None

    db.session.add_all(proteins.values())
    db.session.commit()

    stored_il6 = Protein.query.filter_by(refseq='NM_000600').one()
    assert stored_il6.conservation == ''
def create_test_models():
    """Build a small set of linked model objects for tests.

    Creates a protein with one mutation (annotated by an MC3Mutation and an
    InheritedMutation with a single clinical data entry) and one site linked
    to a kinase that has its own protein.

    Returns:
        dict: the function's local variables as returned by ``locals()``,
        i.e. the keys ``protein``, ``mutation``, ``protein_kinase``,
        ``kinase`` and ``site``. Local names are therefore part of the
        result and must not be renamed.
    """
    protein = Protein(refseq='NM_0001', gene=Gene(name='SOMEGENE'), sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')

    # the mutation details are not bound to names, so they are not part of
    # the returned dict; they are only linked through the `mutation` argument
    MC3Mutation(mutation=mutation, cancer=Cancer(code='CAN'), samples='Some sample')
    InheritedMutation(
        mutation=mutation,
        clin_data=[ClinicalData(disease=Disease(name='Some disease'))])

    protein_kinase = Protein(refseq='NM_0002', gene=Gene(name='OTHERGENE'), sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    return locals()
def mutate(gene: Gene) -> Gene:
    """Return a new Gene that differs from `gene` by one random change.

    One of the following is picked at random: replace the item at some
    index, append a new item, or (only when there are at least two items)
    remove the item at a random index. New items are drawn from
    ``conf.GENE_ITEMS_POOL``. The input gene is not modified.
    """
    items = list(gene.value)
    size = len(items)

    # integer options mean "replace the item at that index"
    possibilities = [*range(size), 'add']
    if size >= 2:
        possibilities.append('remove')

    action = random.choice(possibilities)

    if action == 'add':
        items.append(random.choice(conf.GENE_ITEMS_POOL))
    elif action == 'remove':
        del items[random.randint(0, len(items) - 1)]
    else:
        items[action] = random.choice(conf.GENE_ITEMS_POOL)

    return Gene(tuple(items))
def test_show(self):
    """The mutation page responds with 200 and mentions the gene and refseq."""
    tp53_protein = Protein(
        refseq='NM_000123', sequence='TRAN', gene=Gene(name='TP53')
    )
    db.session.add(Mutation(protein=tp53_protein, position=2, alt='K'))

    response = self.client.get('/mutation/show/NM_000123/2/K')

    assert response.status_code == 200
    for fragment in (b'TP53', b'NM_000123'):
        assert fragment in response.data