def parseCSVFile(filename):
    """Load the rows of an uploaded CSV file into the database.

    The ``File`` record is looked up for the user whose e-mail is stored in
    ``session['email']``, restricted to ``app.config['DATA_FOLDER']``.  Every
    CSV row is then mapped to a ``Gene`` (keyed by description and file id)
    and a ``Condition`` (keyed by condition, value, gene id and replicate);
    rows that already exist are not duplicated.  Once all rows are handled
    the file record is flagged through ``set_loaded()``.

    The CSV header must provide the columns ``name``, ``replicate``,
    ``condition`` and ``data``.

    Args:
        filename: Name of the CSV file inside the configured data folder.
    """
    user = User.query.filter_by(email=session['email']).first()
    uid = user.get_id()
    data_folder = app.config['DATA_FOLDER']
    dbfile = (File.query
              .filter_by(filename=filename)
              .filter_by(user_id=uid)
              .filter_by(location=data_folder)
              .first())
    # The file id does not change while the rows are read, so fetch it once.
    file_id = dbfile.get_id()

    # Read in data and commit to db
    fullfilename = os.path.join(data_folder, filename)
    # NOTE(review): binary mode is what the Python 2 csv module expects; on
    # Python 3 the csv module needs a text-mode file opened with newline=''.
    # Confirm the target interpreter before changing the mode.
    with open(fullfilename, 'rb') as f:
        for line in csv.DictReader(f, delimiter=','):
            name = line['name']
            replicate = line['replicate']
            condition = line['condition']
            data = line['data']

            dbgene = Gene.query.filter_by(descr=name).filter_by(file_id=file_id).first()
            if not dbgene:
                dbgene = Gene(descr=name, file_id=file_id)
                db.session.add(dbgene)
                # Committed right away: the gene id is read just below.
                db.session.commit()

            gene_id = dbgene.get_id()
            dbcond = Condition.query.filter_by(condition=condition, value=data,
                                               gene_id=gene_id,
                                               replicate=replicate).first()
            if not dbcond:
                dbcond = Condition(condition=condition, value=data,
                                   gene_id=gene_id, replicate=replicate)
                db.session.add(dbcond)
                db.session.commit()
    # The ``with`` block has closed the file; no explicit close() is needed.

    # Now can set loaded to true
    dbfile.set_loaded()
class searchRelationTestCase(TestCase):
    """Checks the search helpers against a database holding one gene and one paper."""

    def setUp(self):
        """Save one gene and one paper record that points at it."""
        gene = Gene()
        gene.gene_id = 'test'
        gene.name = 'test_gene'
        gene.ntseq_length = 0
        gene.save()
        self.gene = gene

        paper_gene = Paper_Gene()
        paper_gene.paper_id = 'paper_id'
        paper_gene.paper_title = 'title'
        paper_gene.paper_link = 'www.test.com'
        paper_gene.gene = gene
        paper_gene.paper_keyword = ''
        paper_gene.save()
        self.paper_gene = paper_gene

    def test_search_relation(self):
        """search_relation('none') is expected to give None."""
        self.assertEqual(search_relation('none'), None)

    def test_search_genes(self):
        """search_genes('none') is expected to give None."""
        self.assertEqual(search_genes('none'), None)

    def test_search_papers(self):
        """search_papers('test') is expected to give an empty list."""
        self.assertEqual(search_papers('test'), [])

    def test_search_one_sentence(self):
        """search_one_sentence('name1', 'name2') is expected to give an empty list."""
        self.assertEqual(search_one_sentence('name1', 'name2'), [])

    def test_search_three_sentence(self):
        """search_three_sentence('name1', 'name2') is expected to give an empty list."""
        self.assertEqual(search_three_sentence('name1', 'name2'), [])

    def test_search_related_disease(self):
        """Call search_related_disease('name1') and make sure it does not raise."""
        # NOTE(review): nothing is asserted here, so this test only fails if
        # the call itself raises; it returns True for a truthy result.
        if search_related_disease('name1'):
            return True
def setUp(self):
    """Save one gene and one paper record that points at it."""
    gene = Gene()
    gene.gene_id = 'test'
    gene.name = 'test_gene'
    gene.ntseq_length = 0
    gene.save()
    self.gene = gene

    paper_gene = Paper_Gene()
    paper_gene.paper_id = 'paper_id'
    paper_gene.paper_title = 'title'
    paper_gene.paper_link = 'www.test.com'
    paper_gene.gene = gene
    paper_gene.paper_keyword = ''
    paper_gene.save()
    self.paper_gene = paper_gene
def test_annotation(self):
    """Check Annotation persistence and its many-to-many link with Gene.

    Covers, in order: storing a single annotation, several genes sharing
    one annotation, one gene carrying several annotations (including the
    e-value stored on the link), ``Annotation.genes_per_annotation`` and
    the truncated ``short_description``.
    """
    annotation = Annotation("COG0001")
    self.session.add(annotation)
    self.session.commit()
    # The query must hand back the very same object that was stored.
    assert Annotation.query.first() is annotation

    # Test the many to many relationship
    reference_assembly = ReferenceAssembly("version 1")
    gene = Gene("gene1", reference_assembly)
    gene2 = Gene("gene2", reference_assembly)
    gene3 = Gene("gene3", reference_assembly)
    annotation2 = Annotation("COG0002", description="This cog is really really good")

    # Test having multiple genes to one annotation
    annotation_source = AnnotationSource("Cog", "v1.0", "rpsblast", "e_value=0.000001")
    gene_annotation1 = GeneAnnotation(annotation_source = annotation_source, e_value=0.0000001)
    gene_annotation2 = GeneAnnotation(annotation_source = annotation_source)
    gene_annotation1.gene = gene
    gene_annotation2.gene = gene2
    gene_annotation1.annotation = annotation
    gene_annotation2.annotation = annotation
    # gene and gene2 are not added explicitly; only gene3 (which has no
    # annotation) and the two link objects are.
    self.session.add(annotation)
    self.session.add(gene3)
    self.session.add(gene_annotation1)
    self.session.add(gene_annotation2)
    self.session.commit()

    annotation_01 = Annotation.query.filter_by(type_identifier="COG0001").first()
    assert len(annotation_01.genes) == 2
    assert gene in annotation_01.genes
    assert gene2 in annotation_01.genes
    assert annotation in Gene.query.filter_by(name="gene1").first().annotations
    assert annotation in Gene.query.filter_by(name="gene2").first().annotations
    assert len(Gene.query.filter_by(name="gene3").first().annotations) == 0

    # Genes for annotation method
    genes_for_annotation = Annotation.genes_per_annotation([annotation.id])
    assert len(genes_for_annotation) == 2
    assert (gene, annotation) in genes_for_annotation
    assert (gene2, annotation) in genes_for_annotation

    # Add the second annotation
    self.session.add(annotation2)
    self.session.commit()
    q = Annotation.query.filter(Annotation.description.contains("good"))
    annotation_02 = q.all()
    assert len(annotation_02) == 1
    assert annotation_02[0] == annotation2

    # Test having multiple annotations to one gene
    gene_annotation3 = GeneAnnotation(annotation2, gene, annotation_source, e_value = 1e-14)
    self.session.add(gene_annotation3)
    self.session.commit()
    assert len(Gene.query.filter_by(name="gene1").first().annotations) == 2
    assert annotation in Gene.query.filter_by(name="gene1").first().annotations
    assert annotation2 in Gene.query.filter_by(name="gene1").first().annotations
    # 1e-7 (gene -> annotation) is larger than 1e-14 (gene -> annotation2)
    assert gene_annotation1.e_value > gene_annotation3.e_value
    assert gene.e_value_for(annotation) > gene.e_value_for(annotation2)

    # Links stored at this point:
    # gene -> annotation
    # gene2 -> annotation
    # gene -> annotation2

    # Genes for annotation method
    genes_for_annotation = Annotation.genes_per_annotation([annotation.id])
    assert len(genes_for_annotation) == 2
    assert (gene, annotation) in genes_for_annotation
    assert (gene2, annotation) in genes_for_annotation

    genes_for_annotation = Annotation.genes_per_annotation([annotation2.id])
    assert len(genes_for_annotation) == 1
    assert (gene, annotation2) in genes_for_annotation

    genes_for_annotation = Annotation.genes_per_annotation([annotation.id, annotation2.id])
    assert len(genes_for_annotation) == 3
    assert (gene, annotation) in genes_for_annotation
    assert (gene, annotation2) in genes_for_annotation
    assert (gene2, annotation) in genes_for_annotation

    # annotation3 is never added to the session; only its description
    # handling is checked.
    annotation3 = Annotation("COG0003", description=("This cog is really really good. I assure you, "
        "really quite good. Among its capabilities I have to mention that its utterly suitable for "
        "testing the description string, including the short description."))
    assert len(annotation3.description) > 103
    # The short description is the first 100 characters followed by "..."
    assert annotation3.short_description[-3:] == "..."
    assert len(annotation3.short_description) == 103
    assert annotation3.description[:100] == annotation3.short_description[:100]
def test_taxon(self):
    """Check the Taxon/Gene relation and the per-sample rpkm aggregation.

    Genes, samples and gene counts are added step by step; after each step
    ``Taxon.rpkm`` (and finally ``Taxon.rpkm_table``) is compared with the
    expected per-sample sums.
    """
    ref_assembly = ReferenceAssembly("Version 1")
    gene1 = Gene("gene1", ref_assembly)
    sample1 = Sample("P1993_101", None, None)
    # NOTE(review): reference_assembly is never used below - confirm it can go.
    reference_assembly = ReferenceAssembly("version 1")
    gene_count1 = GeneCount(gene1, sample1, 0.001)
    taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
    gene1.taxon = taxon1
    self.session.add(gene1)
    self.session.add(taxon1)
    self.session.add(sample1)
    self.session.add(gene_count1)
    self.session.commit()

    # Re-read both objects through the query interface.
    gene1 = Gene.query.first()
    taxon1 = Taxon.query.first()
    assert gene1.taxon == taxon1
    assert gene1 in taxon1.genes
    assert taxon1.superkingdom == 'Bacteria'
    assert taxon1.phylum == 'Proteobacteria'
    # Levels that were not given come back as empty strings.
    assert taxon1.taxclass == ''
    assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
    refresh_all_mat_views()

    # Test sample count retrieval
    sample2 = Sample("P1993_102", None, None)
    self.session.add(sample2)
    self.session.commit()
    refresh_all_mat_views()
    # sample2 has no gene counts yet, so it is absent from the result.
    assert taxon1.rpkm == {sample1: 0.001}

    gene_count2 = GeneCount(gene1, sample2, 0.2)
    self.session.add(gene_count2)
    self.session.commit()
    refresh_all_mat_views()
    assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

    gene2 = Gene("gene2", ref_assembly)
    gene_count3 = GeneCount(gene2, sample2, 0.1)
    self.session.add(gene2)
    self.session.add(gene_count3)
    self.session.commit()
    refresh_all_mat_views()
    # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
    assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

    taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
    gene2.taxon = taxon2
    self.session.add(taxon2)
    self.session.add(gene2)
    self.session.commit()
    refresh_all_mat_views()
    # Taxon2 should have gene_count3 stats only
    assert taxon2.rpkm == {sample2: 0.1}

    gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
    gene_count4 = GeneCount(gene3, sample1, 1.0)
    self.session.add(gene3)
    self.session.add(gene_count4)
    self.session.commit()
    # NOTE(review): unlike the checks above, no refresh_all_mat_views() call
    # precedes this assert - confirm that this is intended.
    # Taxon1 should now have the original stats plus gene_count4
    assert taxon1.rpkm == {sample1: 1.001, sample2: 0.2}

    taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
    self.session.add(taxon3)
    self.session.commit()
    gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
    gene_count5 = GeneCount(gene4, sample2, 0.003)
    self.session.add(gene4)
    self.session.add(gene_count5)
    self.session.commit()
    refresh_all_mat_views()

    # theoretical rpkm_table:
    # samples = [sample1, sample2]
    # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
    samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
    assert samples == [sample1, sample2]
    assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Bacteria", "Eukaryota"] # Sorted by summed rpkm
    assert rpkm_table[("Bacteria")] == {sample1: 1.001, sample2: 0.2}
    assert rpkm_table[("Eukaryota")] == {sample2: 0.103}

    # At phylum level the table keys are the ';'-joined taxonomy down to that
    # level, and complete_val_to_val maps each key to the phylum name alone.
    samples, rpkm_table, complete_val_to_val= Taxon.rpkm_table(level='phylum')
    assert samples == [sample1, sample2]
    assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Proteobacteria", "Chlorophyta", "Unnamed"] # Sorted by summed rpkm
    assert rpkm_table[("Bacteria;Proteobacteria")] == {sample1: 1.001, sample2: 0.2}
    assert rpkm_table[("Eukaryota;Chlorophyta")] == {sample2: 0.1}
    assert rpkm_table[("Eukaryota;Unnamed")] == {sample2: 0.003}
def test_autocomplete_all(self):
    """Exercise the ``/search/autocomplete_all/`` endpoint end to end.

    The sections below cover, in order: amino-acid mutations and the
    "awaiting" prompts, genes, genomic (nucleotide) mutations including the
    complementary-strand fallback, pathways (Reactome and Gene Ontology ids,
    partial and case-insensitive matching, the "show more" link), diseases,
    and finally the "disease in gene/refseq" queries.
    """
    # MC3 GeneList is required as a target (a href for links) where users will be pointed
    # after clicking of cancer autocomplete suggestion. Likewise with the ClinVar list.
    db.session.add_all([
        GeneList(name=name, mutation_source_name=detail_class.name)
        for name, detail_class in [
            ('TCGA', MC3Mutation),
            ('ClinVar', InheritedMutation)
        ]
    ])

    g = Gene(name='BR')
    p = Protein(id=1, refseq='NM_007', gene=g, sequence='XXXXXV')
    g.preferred_isoform = p     # required for gene search to work - genes without preferred isoforms are ignored
    mut = Mutation(protein=p, position=6, alt='E')
    db.session.add_all([mut, p, g])

    def autocomplete(query):
        """Query the endpoint and follow every URL found in the response."""
        r = self.client.get('/search/autocomplete_all/?q=' + query)
        self.visit_returned_urls(r)
        return r

    from database import bdb_refseq, bdb
    bdb_refseq['BR V6E'] = [p.id]  # required for mutation search
    bdb.add_genomic_mut('1', 10000, 'T', 'C', mut)

    # Gene and mutations
    response = autocomplete('BR V6E')
    entry = get_entry_and_check_type(response, 'aminoacid mutation')
    assert entry

    response = autocomplete('BR V6')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{alt}</code>' in entry['name']

    response = autocomplete('BR V')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{pos}{alt}</code>' in entry['name']

    response = autocomplete('B')
    entry = get_entry_and_check_type(response, 'gene')
    assert 'BR' == entry['name']

    # genomic mutation
    response = autocomplete('chr1 10000 T C')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'CHR1 10000 T C'

    # is the search falling back to the other strand?
    response = autocomplete('chr1 10000 A G')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'complement of CHR1 10000 A G'

    prompt = 'Awaiting for mutation in <code>{chrom} {pos} {ref} {alt}</code> format'

    for prompt_invoking_query in ['chr1', 'chr1 ', 'chr1 40', 'chr1 40 ', 'chr1 40 T']:
        response = autocomplete(prompt_invoking_query)
        entry = get_entry_and_check_type(response, 'message')
        assert entry['name'] == prompt

    # Pathways
    pathways = [
        Pathway(description='Activation of RAS in B cells', reactome=1169092),
        Pathway(description='abortive mitotic cell cycle', gene_ontology=33277),
        Pathway(description='amacrine cell differentiation', gene_ontology=35881),
        Pathway(description='amniotic stem cell differentiation', gene_ontology=97086)
    ]
    db.session.add_all(pathways)

    # test partial matching and Reactome id pathways search
    for ras_activation_query in ['Activation', 'REAC:1', 'REAC:1169092']:
        response = autocomplete(ras_activation_query)
        entry = get_entry_and_check_type(response, 'pathway')
        assert entry['name'].startswith('Activation of RAS in B cells')

    # test Gene Ontology search:
    response = autocomplete('GO:33')
    go_pathway = get_entry_and_check_type(response, 'pathway')
    assert go_pathway['name'] == 'abortive mitotic cell cycle (GO:33277)'

    # check if multiple pathways are returned
    response = autocomplete('differentiation')
    assert len(response.json['entries']) == 2

    # check if both genes an pathways are returned simultaneously
    # there should be: a pathway ('a>b<ortive...') and the >B<R gene
    response = autocomplete('b')
    entries = response.json['entries']
    names = [entry['name'] for entry in entries]
    assert all([name in names for name in ['BR', 'abortive mitotic cell cycle']])

    # check if "search more pathways" is displayed
    response = autocomplete('cell')     # cell occurs in all four of added pathways;
    # as a limit of pathways shown is 3, we should get a "show more" link
    links = entries_with_type(response, 'see_more')
    assert len(links) == 1
    assert links[0]['name'] == 'Show all pathways matching <i>cell</i>'

    # test case insensitive text search
    response = autocomplete('AMNIOTIC STEM')
    pathways = entries_with_type(response, 'pathway')
    assert len(pathways) == 1
    assert pathways[0]['name'] == 'amniotic stem cell differentiation'

    # Disease
    disease_names = [
        'Cystic fibrosis',
        'Polycystic kidney disease 2',
        'Frontotemporal dementia',
        'Cataract, nuclear total'
    ]
    diseases = {name: Disease(name=name) for name in disease_names}
    db.session.add_all(diseases.values())

    response = autocomplete('cystic')
    cystic_matching = entries_with_type(response, 'disease')
    # both 'Cystic fibrosis' and PKD2 should match
    assert len(cystic_matching) == 2

    # is comma containing disease name properly linked?
    response = autocomplete('Cataract')
    cataract = get_entry_and_check_type(response, 'disease')
    assert cataract['name'] == 'Cataract, nuclear total'

    # Gene mutation in disease
    # test suggestions
    response = autocomplete('cystic ')
    entry = entries_with_type(response, 'message')[0]
    # Raw string: '\?' is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    assert re.match(r'Do you wish to search for (.*?) mutations\?', entry['name'])

    # currently there are no mutations associated with any disease
    # so the auto-completion should not return any results
    response = autocomplete('cystic in ')
    assert not response.json['entries']

    # let's add a mutation
    m = Mutation(protein=p, position=1, alt='Y')
    bdb_refseq['BR X1Y'] = ['NM_007']
    # note: sig_code is required here
    data = ClinicalData(disease=diseases['Cystic fibrosis'], sig_code=1)
    disease_mutation = InheritedMutation(mutation=m, clin_data=[data])
    db.session.add_all([m, data, disease_mutation])

    # should return '.. in BR' suggestion now.
    for query in ['cystic in', 'cystic in ']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        assert result['gene'] == 'BR'
        assert result['name'] == 'Cystic fibrosis'

    # both gene search and refseq search should yield the same, non-empty results
    results = []
    for query in ['cystic in BR', 'cystic in NM_007', 'cystic in 007']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        results.append(result)

    # Every collected result equals the last one, and that one is truthy.
    assert all(r == result for r in results) and result
def create_test_protein():
    """Return a Protein 'NM_0007' with sequence 'TRAN' that belongs to a new gene 'Gene X'."""
    return Protein(refseq='NM_0007', gene=Gene(name='Gene X'), sequence='TRAN')
def execute_gene(self, feature_rows, strain_id):
    """Queue the Gene, Transcript and Feature entries found in a group of GFF rows.

    Rows are handled in order.  A row carrying an ``ID=Gene`` attribute sets
    the current gene (only when its type column is ``"gene"``); later rows
    are read as transcripts (``ID=Transcript``) or transcript features
    (``Parent=Transcript``) of that gene.  Nothing is written here: new
    objects are appended to ``self.genes_to_write``,
    ``self.transcripts_to_write`` and ``self.features_to_write``, and the
    ``self.genes_seen`` / ``self.transcripts_seen`` maps prevent duplicates.

    Args:
        feature_rows: Rows of split GFF columns; columns 0, 2, 3, 4, 6 and 8
            (chromosome, type, start, end, strand, attributes) are read.
        strain_id: Strain identifier stored on every Feature created.

    Returns:
        None.  Processing stops early when a gene is rejected by
        ``self.filter_genes``.
    """
    gene_id = None

    for feature_row in feature_rows:
        # Loop through annotation rows in the gff file, all related to the current gene
        start = feature_row[3]
        end = feature_row[4]
        direction = "forward" if feature_row[6] == "+" else "reverse"
        chromosome_id = feature_row[0]
        feature_type = feature_row[2]
        attribs = feature_row[8].strip()

        new_gene_id = self.find_attribs_value("ID=Gene", attribs)
        if new_gene_id is not None:
            # only deal with proper genes. setting gene_id to None means nothing else will be processed.
            # so it will essentially skip non-"gene" entries.
            if feature_type != "gene":
                gene_id = None
                continue

            # Check against filter list if there is one
            if self.filter_genes is not None and new_gene_id not in self.filter_genes:
                # filter list exists, and gene is not in filter list
                # skip this gene
                return

            gene_id = new_gene_id

            # add the Gene entry - if it hasn't been already
            if gene_id not in self.genes_seen:
                gene = Gene(gene_id)
                self.genes_to_write.append(gene)
                self.genes_seen[gene_id] = gene

        elif gene_id is not None:
            # Handle transcript entries - if the gene is legit
            transcript_id = self.find_attribs_value("ID=Transcript", attribs)
            if transcript_id is not None:
                # it's a transcript entry
                # add the Transcript entry - if it hasn't been already
                transcript_id = self.ensure_unique_transcript_id(transcript_id)
                if transcript_id not in self.transcripts_seen:
                    transcript = Transcript(
                        id=transcript_id,
                        gene_id=gene_id
                    )
                    self.transcripts_to_write.append(transcript)
                    self.transcripts_seen[transcript.id] = transcript
            else:
                # Handle transcript feature entries
                # for some reason, features for a given strain/transcript
                # combination are not always added
                transcript_id = self.find_attribs_value("Parent=Transcript", attribs)
                if transcript_id is not None:
                    # it's a transcript feature entry
                    # put a filter here? some elements are not worth storing?
                    self.features_to_write.append(Feature(
                        transcript_id=transcript_id,
                        type_id=feature_type,
                        strain_id=strain_id,
                        chromosome_id=chromosome_id,
                        start=start,
                        end=end,
                        direction=direction
                    ))
                # rows without a parent transcript are ignored: this happens
                # for pseudogenes and TEs - which we aint interested in
def test_gene_set():
    """A GeneSet built from names supports membership tests and has a readable str()."""
    member_names = ['MDM2', 'TP53']
    gene_set = GeneSet('set', member_names)

    assert Gene('TP53') in gene_set
    assert str(gene_set) == '<GeneSet: set with 2 genes>'
def iterate_gtex_vs_spidex(strict=False, tissues=GTEX_TISSUES, location=None, filters=None, no_further_than_x_from_gene=None):
    """Yield matching GTEx-SPIDEX pairs.

    A pair matches when both sources describe the same position, reference
    allele and alternative allele.

    Args:
        strict: should critical data discrepancies raise errors or be
            collected as statistics? When true, the collected statistics
            are printed once iteration is over.
        tissues: list of tissues to be used
        location: 'intronic' or 'exonic' - filters SPIDEX records
        filters: dict of attribute name -> required value; a SPIDEX record
            is kept only when every attribute equals the given value.
        no_further_than_x_from_gene: maximal allowed distance between the
            mutation and its gene; if 0, accept only mutations acting on
            their gene. None disables the check.

    Yields:
        Tuples of (variant, tissue, slope, record, gene).

    In spidex there are only SNPs (single!)

    Definitions for GTEx (from http://www.gtexportal.org/home/documentationPage):

        The effect size of the eQTLs is defined as the slope of the linear
        regression, and is computed as the effect of the alternative allele
        (ALT) relative to the reference allele (REF) in the human genome
        reference GRCh37/hg19 (i.e., the eQTL effect allele is the ALT allele).

    Definitions for SPIDEX (more in spidex/README):

        dpsi_max_tissue: The delta PSI. This is the predicted change in
            percent-inclusion due to the variant, reported as the maximum
            across tissues (in percent).
        dpsi_zscore: This is the z-score of dpsi_max_tissue relative to the
            distribution of dPSI that are due to common SNP.
        ref_allele: The reference allele at the variant position (forward-strand)
        mut_allele: The mutant allele at the variant position (forward-strand)
    """
    # Use "Brain cortex" for basic tests - it's very small
    # tissues = ['Brain_Cortex']
    # Use "Adipose Subcutaneous" for larger tests
    # tissues = ['Adipose_Subcutaneous']

    genes = ExpressedGenes(create_path_for_genes_db(tissues))
    genes.reset()
    import_expressed_genes(genes, tissues=tissues)

    spidex_index = tabix.open(SPIDEX_LOCATION)
    total = count_all(tissues)
    stats = Counter()

    def passes_filters(candidate):
        """Tell whether the record has the requested value for every filtered attribute."""
        return all(
            getattr(candidate, key) == value
            for key, value in filters.items()
        )

    expression_rows = tqdm(iterate_over_expression(tissues), total=total)

    for mutation_code, tissue, slope, ensembl_gene_id in expression_rows:
        chrom, pos, ref, alt, _ = mutation_code.split('_')

        # In spidex there are only SNPs (single!)
        if len(ref) != 1 or len(alt) != 1:
            stats['not_single'] += 1
            continue

        pos = int(pos)
        gene = Gene(*genes[ensembl_gene_id])

        if not gene:
            print('gene %s not present in data' % ensembl_gene_id)
            continue

        margin = no_further_than_x_from_gene
        if margin is not None:
            inside_span = gene.start - margin <= pos <= gene.end + margin
            if not inside_span:
                stats['not_within_requested_gene_span'] += 1
                continue

        strand = gene.strand
        variant = SingleAltVariant(
            chr_name=chrom,
            chr_start=pos,
            chr_end=pos,
            chr_strand=strand,
            snp_id='-',
            ref=convert_to_strand(ref, strand),
            alt=convert_to_strand(alt, strand),
            gene=ensembl_gene_id
        )

        records = spidex_get_variant(spidex_index, variant)
        if filters:
            records = [candidate for candidate in records if passes_filters(candidate)]

        record = None
        try:
            # if genes are the same there is no need to test strands, but its better to double check
            record = choose_record(
                records, variant, variant.alt,
                strict=strict, test_strand=True, location=location
            )
        except StrandMismatch:
            stats['strand_mismatch'] += 1
        except Intronic:
            stats['intronic'] += 1
        except ToManyRecords:
            stats['to_many_records'] += 1

        if not record:
            stats['Not found in SPIDEX'] += 1
            continue

        if gene.name != record.gene:
            stats['gene_name_mismatch'] += 1
            continue

        variant.refseq_transcript = record.transcript
        yield variant, tissue, slope, record, gene

    if strict:
        print(stats)
def test_protein_references(self):
    """Check that load_external_references attaches references to known proteins.

    Three temporary mapping files (UniProt id mapping, RefSeq and reflink
    data) are created and loaded; UniProt entries, Ensembl peptides, the
    RefSeq protein id and the Entrez gene id are then verified on the
    proteins prepared below.
    """
    # The UniProt and reflink fixtures are written through gzip.open; the
    # refseq fixture uses the helper's defaults.
    uniprot_filename = make_named_temp_file(data=idmapping_dat, opener=gzip.open, mode='wt')
    reflink_filename = make_named_temp_file(data=reflink_data, opener=gzip.open, mode='wt', suffix='.gz')
    refseq_filename = make_named_temp_file(data=refseq_data)

    refseqs = [
        'NM_011739',    # present in reference mappings
        'NM_001131572',    # present
        'NM_201200',    # present
        'NM_0001'   # not present in reference mappings
    ]

    g = Gene(name='Some gene')
    proteins_we_have = {
        refseq_nm: Protein(refseq=refseq_nm, gene=g)
        for refseq_nm in refseqs
    }

    tp53 = Gene(name='TP53')
    tp53_protein = Protein(refseq='NM_000546', gene=tp53)

    with self.app.app_context():
        # let's pretend that we already have some proteins in our db
        db.session.add_all(proteins_we_have.values())
        db.session.add(tp53_protein)

        references = load_external_references(uniprot_filename, refseq_filename, reflink_filename)

    # there are 3 references we would like to have extracted
    assert len(references) == 3

    protein = proteins_we_have['NM_011739']

    assert len(protein.external_references.uniprot_entries) == 2
    # Of the two entries, index 1 is the reviewed one and index 0 is not.
    uniprot_entry = protein.external_references.uniprot_entries[1]
    assert uniprot_entry.accession == 'P68254'
    assert uniprot_entry.isoform == 1
    assert uniprot_entry.reviewed is True

    uniprot_entry = protein.external_references.uniprot_entries[0]
    assert uniprot_entry.reviewed is False

    ensembl_peptides = protein.external_references.ensembl_peptides

    assert len(ensembl_peptides) == 2
    # Compared as a set, so the order of the peptides does not matter.
    assert (set(ensembl.peptide_id for ensembl in ensembl_peptides) == { 'ENSMUSP00000106602', 'ENSMUSP00000100067' })

    protein = proteins_we_have['NM_001131572']

    assert len(protein.external_references.uniprot_entries) == 1
    uniprot_entry = protein.external_references.uniprot_entries[0]
    assert uniprot_entry.accession == 'Q5RFJ2'
    assert uniprot_entry.isoform == 1
    assert uniprot_entry.reviewed is False

    # check if protein without references stays clear
    protein = proteins_we_have['NM_0001']

    # it's needed to re-add the protein cause ORM will emit a query
    # (just in case, that's how flask-testing works - any object needs
    # to be re-added to session after its termination)
    db.session.add(protein)

    assert protein.external_references is None

    # check the protein with refseq references and gene with entrez id
    assert tp53_protein.external_references.refseq_np == 'NP_000537'
    assert tp53.entrez_id == 7157
def test_gene_init():
    """A Gene exposes the name it was created with."""
    expected_name = 'BAD'
    assert Gene(expected_name).name == expected_name
def parser(line):
    """Turn one split input row into a Gene (created or updated) and a new Protein.

    The gene is looked up case-insensitively in ``genes``; a known gene gets
    its differing attributes overwritten (with a printed notice).  Rows whose
    refseq is in ``known_proteins`` are skipped.  A refseq already present in
    ``proteins`` is recorded in ``with_duplicates``, its gene is put into
    ``potentially_empty_genes`` and the row is skipped.  Otherwise a Protein
    with the integer coordinates taken from ``columns`` is stored in
    ``proteins``.
    """
    # use name2 (fourth column from the end)
    name = line[-4]
    strand = line[3]
    assert strand in allowed_strands

    chrom = line[2][3:]    # remove chr prefix
    gene_data = {'name': name, 'chrom': chrom, 'strand': strand == '+'}

    gene_key = name.lower()
    if gene_key in genes:
        gene = genes[gene_key]
        for attribute, new_value in gene_data.items():
            old_value = getattr(gene, attribute)
            if old_value == new_value:
                continue
            print('Replacing %s %s with %s (previously: %s)' % (gene, attribute, new_value, old_value))
            setattr(gene, attribute, new_value)
    else:
        gene = Gene(**gene_data)
        genes[gene_key] = gene

    # load protein
    refseq = line[1]

    # if protein is already in database no action is required
    if refseq in known_proteins:
        return

    # do not allow duplicates
    if refseq in proteins:
        with_duplicates.append(refseq)
        potentially_empty_genes.add(gene)
        # Duplicates are skipped without a message; a notice that told
        # entries on chromosomes X/Y (probably pseudoautosomal regions)
        # apart from other duplicates used to be printed here.
        return

    # from this line there is no processing of duplicates allowed
    assert refseq not in proteins

    coordinate_values = [
        int(cell)
        for index, cell in enumerate(line)
        if index in columns
    ]
    protein_data = {'refseq': refseq, 'gene': gene}
    protein_data.update(zip(coordinates_names, coordinate_values))

    proteins[refseq] = Protein(**protein_data)
def run(self, experiment: Experiment) -> ImpactAnalysisResult:
    """Run the impact analysis for the given experiment.

    Fold changes are calculated first and genes with a NaN fold change are
    excluded from the experiment.  Differentially expressed genes are taken
    from ``self.degs`` when provided, otherwise selected with a t-test
    against ``self.threshold``.  Every KEGG pathway containing one of those
    genes is then scored.

    Returns:
        list of pathways sorted by their impact factor. Each pathway in the list has values of FDR and
        Bonferroni corrections assigned. The result is empty when there are
        no differentially expressed genes or no pathways were found.
    """
    self.experiment_genes = {gene.name for gene in experiment.get_all().genes}

    # calculate fold change
    self.FC = experiment.calculate_fold_change()

    # remove genes for which fold change cannot be calculated correctly
    undefined_fc = self.FC['FC'][isnan(self.FC['FC'])]
    experiment.exclude_genes(list(undefined_fc.index))

    if self.degs:
        self.degs = pd.Series({
            Gene(x): True
            for x in self.degs
            if Gene(x) not in self.experiment_genes
        })
    else:
        # select differentialy expressed genes
        pvalue = ttest(experiment) <= self.threshold
        self.degs = pvalue[pvalue == True]

    if self.degs.size == 0:
        # if there are no DEGs anywhere, the problem of finding the impact on various pathways is meaningless
        print('No differentialy expressed genes.')
        return ImpactAnalysisResult([])

    db = KEGGPathways(self.org)

    # Collect every pathway that contains at least one of the DEGs; the
    # first description seen for a pathway code wins.
    pathways = {}
    deg_names = [deg.name for deg in list(self.degs.index)]
    for deg_name in deg_names:
        for code, description in db.search_by_gene(deg_name).items():
            pathways.setdefault(code, description)

    if not pathways:
        print('No pathways found in database.')
        return ImpactAnalysisResult([])

    res = pd.DataFrame(columns=['name', 'IF', 'pvalue'])
    for code, descr in pathways.items():
        pathway = db.get_pathway(code)
        impact_factor, pval = self.calculate_impact_factor(experiment, pathway)
        if impact_factor is None or pval is None:
            continue
        res.loc[len(res.index)] = [descr, impact_factor, pval]

    fdr, bonferroni = self.calculate_corrections(res['pvalue'])
    res['FDR'] = fdr
    res['Bonferroni'] = bonferroni

    def impact_or_zero(pathway):
        """Sort key: the impact factor, with NaN treated as 0."""
        return pathway.IF if not isnan(pathway.IF) else 0

    ifp_pathways = sorted(
        (IAPathway(res.loc[i]) for i in range(len(res.index))),
        key=impact_or_zero,
        reverse=True
    )

    result = ImpactAnalysisResult(ifp_pathways)
    if self.markdown:
        result.generate_markdown(self.markdown, 'Results of Impact Analysis:')
    return result
def test_mutation(self):
    """Query the chromosome-mutation endpoint for a novel and for a known mutation.

    Two genomic mutations on chromosome 20 are registered for one protein.
    The first is queried as a novel mutation and its whole JSON payload is
    compared; the second gets MC3 and ESP6500 data attached, after which the
    dataset metadata and the ``filters`` query parameter are checked.
    """
    methylation_site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(
        refseq='NM_007', id=1, sites=[methylation_site],
        sequence='A' * 15, gene=Gene(name='SomeGene')
    )
    db.session.add(protein)

    from database import bdb

    # amino acid position -> genomic position
    dna_positions = {13: 14370, 15: 14376}
    muts = {}
    for aa_pos, dna_pos in dna_positions.items():
        mutation = Mutation(protein=protein, position=aa_pos, alt='V')
        muts[aa_pos] = mutation
        bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mutation, is_ptm=True)

    query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

    # query as a novel mutation
    novel_query = query_url.format(chrom='chr20', pos=14370, ref='G', alt='A')
    response = self.client.get(novel_query)

    assert response.status_code == 200

    expected_site = {
        'kinases': [],
        'position': 13,
        'residue': 'A',
        'kinase_groups': [],
        'type': 'methylation'
    }
    expected_mutation = {
        'alt': 'V',
        'gene': 'SomeGene',
        'in_datasets': {},
        'pos': 13,
        'ptm_impact': 'direct',
        'cnt_ptm': 1,
        'closest_sites': ['13 A'],
        'protein': 'NM_007',
        'sites': [expected_site],
        'ref': 'A'
    }
    assert response.json == [expected_mutation]

    # well let's look on a known mutation:
    known = muts[15]
    breast_cancer = Cancer(name='Breast invasive carcinoma', code='BRCA')
    mc3 = MC3Mutation(mutation=known, cancer=breast_cancer, count=1)
    esp = ExomeSequencingMutation(mutation=known, maf_all=0.02, maf_aa=0.02)
    db.session.add_all([known, mc3, esp])
    db.session.commit()

    mutation_a15v_query = query_url.format(chrom='chr20', pos=14376, ref='G', alt='A')
    response = self.client.get(mutation_a15v_query)

    metadata = {
        'MC3': {
            'Cancers': [{'Cancer': 'Breast invasive carcinoma', 'Value': 1}]
        },
        'ESP6500': {'MAF': 0.02, 'MAF AA': 0.02, 'MAF EA': None}
    }
    assert response.json[0]['in_datasets'] == metadata

    expected_values = {'MC3': 1, 'ESP6500': 0.02}

    # if user does not want to download data for all datasets he may use:
    for source, meta in metadata.items():
        filtered_url = mutation_a15v_query + '?filters=Mutation.sources:in:' + source
        entry = self.client.get(filtered_url).json[0]
        assert entry['in_datasets'] == {source: meta}
        assert entry['value'] == expected_values[source]

    # (query suffix, whether any result is expected)
    narrowed_queries = [
        ('?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA', True),
        ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American', True),
        ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American', False),
    ]
    for suffix, expect_results in narrowed_queries:
        response = self.client.get(mutation_a15v_query + suffix)
        assert bool(response.json) is expect_results
from database import db
from app import create_app

# Create the application, attach the database to it and make its
# application context the active one for the statements below.
app = create_app()
db.init_app(app)
app.app_context().push()

# Imported only after the application context has been pushed.
from models import Strain, Plasmid, Gene

# Start from empty tables.
db.drop_all(bind=None)
db.create_all()

# Test strains
strain1, strain2 = (
    Strain("E. coli", "Andrew", "2019-10-24", "It's hungry", "yes"),
    Strain("S. aureus", "Andrew", "2019-10-23", "It's cool", "yes"),
)

# Test plasmids
plasmid1, plasmid2 = (
    Plasmid("ACTG", "TCTA", "Andrew", "2019-10-24", "some notes", "file.txt", "1,2"),
    Plasmid("TTCA", "GGTA", "Andrew", "2019-10-24", "some more notes", "file2.txt", "3,4"),
)

# Test genes
gene1, gene2 = (
    Gene("A happy gene", "CCCA", "Andrew", "2019-10-23", "notes here", "file3.txt"),
    Gene("A sad gene", "GTCA", "Andrew", "2019-10-23", "even more notes here", "file4.txt"),
)

# Store all six sample rows in one commit.
db.session.add_all([strain1, strain2, plasmid1, plasmid2, gene1, gene2])
db.session.commit()
def test_mutated_sites(self):
    """Check most_mutated_sites for single sources, both sources and a site type filter.

    One protein gets three sites (positions 1, 11 and 21) and mutations from
    two sources (InheritedMutation, i.e. ClinVar, and MC3Mutation, i.e.
    TCGA) placed on or next to them.  The expected (site, count) pairs are
    spelled out next to each site definition below.
    """
    g = Gene(name='Gene X')
    p = Protein(refseq='NM_007', sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ', gene=g)
    g.preferred_isoform = p

    glycosylation = SiteType(name='glycosylation')

    sites = {
        # ClinVar muts and TCGA muts but different, with total count = 5 (3 + 2)
        'A': Site(position=1, residue='A', protein=p),
        # ClinVar muts intersection TCGA muts, total count = 4 (2 + 2)
        'K': Site(position=11, residue='K', protein=p),
        # Only TCGA muts, total count = 3 (1 + 2)
        'U': Site(position=21, residue='U', protein=p, types={glycosylation})
    }

    def mut(pos):
        """Create a mutation of the test protein at the given position."""
        return Mutation(position=pos, alt='X', protein=p)

    # Shared by a ClinVar and an MC3 record, so that it counts for both sources.
    intersecting_mut = mut(11)

    mutations = [
        # the first site (1 A)
        # (the clin_data entries add up to the count of 3 expected below)
        InheritedMutation(
            mutation=mut(1),
            clin_data=[ClinicalData(), ClinicalData(), ClinicalData()]
        ),
        # position 2 is next to the site rather than on it
        MC3Mutation(mutation=mut(2), count=2),
        # the second site (11 K)
        InheritedMutation(
            mutation=intersecting_mut,
            clin_data=[ClinicalData(), ClinicalData()]
        ),
        MC3Mutation(mutation=intersecting_mut, count=2),
        # the third site (21 U)
        # (positions 20 and 22 are the direct neighbours of the site)
        MC3Mutation(mutation=mut(20), count=1),
        MC3Mutation(mutation=mut(22), count=2),
    ]

    db.session.add_all(mutations)
    db.session.add_all([p, g])
    db.session.add_all(sites.values())
    db.session.commit()

    # Compared as a list: the order of the result (highest count first) matters.
    sites_with_clinvar = most_mutated_sites([InheritedMutation]).all()
    assert sites_with_clinvar == [(sites['A'], 3), (sites['K'], 2)]

    # Compared as a set: two of the sites share the count 2.
    sites_with_mc3 = most_mutated_sites([MC3Mutation]).all()
    assert set(sites_with_mc3) == {(sites['A'], 2), (sites['K'], 2), (sites['U'], 3)}

    both_sources = [MC3Mutation, InheritedMutation]

    # With intersection=True only site K is expected: it is the one whose
    # mutation is present in both sources.
    sites_with_muts_in_both_intersection = most_mutated_sites(both_sources, intersection=True).all()
    assert sites_with_muts_in_both_intersection == [(sites['K'], 4)]

    # With intersection=False site U (MC3 mutations only) is still not expected.
    sites_with_muts_in_both = most_mutated_sites(both_sources, intersection=False).all()
    assert sites_with_muts_in_both == [(sites['A'], 5), (sites['K'], 4)]

    glyco_sites_with_mc3 = most_mutated_sites([MC3Mutation], site_type=glycosylation).all()
    assert glyco_sites_with_mc3 == [(sites['U'], 3)]