Example #1
0
def parseCSVFile(filename):
	"""Import the rows of an uploaded CSV file into the database.

	The file record is looked up by name, by the id of the user whose email
	is stored in the session, and by the configured data folder. Every CSV
	row must provide the columns ``name``, ``replicate``, ``condition`` and
	``data``. A Gene (per name and file) and a Condition (per condition,
	value, gene and replicate) are created only when no matching row exists
	yet. Once all rows are processed the file record is marked as loaded.
	"""
	user = User.query.filter_by(email=session['email']).first()
	uid = user.get_id()
	data_folder = app.config['DATA_FOLDER']
	dbfile = File.query.filter_by(filename=filename).filter_by(user_id=uid).filter_by(location=data_folder).first()
	# The file id is the same for every row, so look it up once.
	file_id = dbfile.get_id()

	# Read in data and commit to db.
	# NOTE(review): binary mode is what the Python 2 csv module expects; under
	# Python 3 the file would have to be opened in text mode with newline=''.
	with open(os.path.join(data_folder, filename), 'rb') as f:
		for line in csv.DictReader(f, delimiter=','):
			name = line['name']
			replicate = line['replicate']
			condition = line['condition']
			data = line['data']

			dbgene = Gene.query.filter_by(descr=name).filter_by(file_id=file_id).first()
			if not dbgene:
				dbgene = Gene(descr=name, file_id=file_id)
				db.session.add(dbgene)
				# Commit right away so the new gene has an id for the condition below.
				db.session.commit()

			gene_id = dbgene.get_id()
			dbcond = Condition.query.filter_by(condition=condition, value=data, gene_id=gene_id, replicate=replicate).first()
			if not dbcond:
				dbcond = Condition(condition=condition, value=data, gene_id=gene_id, replicate=replicate)
				db.session.add(dbcond)
				db.session.commit()
	# The with-statement has already closed the file; no explicit close() needed.

	# Now can set loaded to true
	dbfile.set_loaded()
Example #2
0
class searchRelationTestCase(TestCase):
	"""Checks the search helpers against a fixture of one gene and one paper."""

	def setUp(self):
		"""Save a gene and a paper that refers to it."""
		gene = self.gene = Gene()
		gene.gene_id = 'test'
		gene.name = 'test_gene'
		gene.ntseq_length = 0
		gene.save()

		paper = self.paper_gene = Paper_Gene()
		paper.paper_id = 'paper_id'
		paper.paper_title = 'title'
		paper.paper_link = 'www.test.com'
		paper.gene = gene
		paper.paper_keyword = ''
		paper.save()

	def test_search_relation(self):
		"""search_relation('none') is expected to give None."""
		self.assertEqual(search_relation('none'), None)

	def test_search_genes(self):
		"""search_genes('none') is expected to give None."""
		self.assertEqual(search_genes('none'), None)

	def test_search_papers(self):
		"""search_papers('test') is expected to give an empty list."""
		self.assertEqual(search_papers('test'), [])

	def test_search_one_sentence(self):
		"""search_one_sentence on two names is expected to give an empty list."""
		self.assertEqual(search_one_sentence('name1', 'name2'), [])

	def test_search_three_sentence(self):
		"""search_three_sentence on two names is expected to give an empty list."""
		self.assertEqual(search_three_sentence('name1', 'name2'), [])

	def test_search_related_disease(self):
		"""Call search_related_disease; nothing is asserted about its result."""
		return True if search_related_disease('name1') else None
Example #3
0
	def setUp(self):
		"""Save one gene and one paper referencing it as the test fixture."""
		self.gene = Gene()
		gene_fields = (
			('gene_id', 'test'),
			('name', 'test_gene'),
			('ntseq_length', 0),
		)
		for field, value in gene_fields:
			setattr(self.gene, field, value)
		self.gene.save()

		self.paper_gene = Paper_Gene()
		paper_fields = (
			('paper_id', 'paper_id'),
			('paper_title', 'title'),
			('paper_link', 'www.test.com'),
			('gene', self.gene),
			('paper_keyword', ''),
		)
		for field, value in paper_fields:
			setattr(self.paper_gene, field, value)
		self.paper_gene.save()
    def test_annotation(self):
        """Exercise Annotation storage and its many-to-many link to Gene.

        Covers: persisting a single annotation, several genes sharing one
        annotation, one gene carrying several annotations, e-value comparison
        through GeneAnnotation, Annotation.genes_per_annotation, and the
        truncated short_description.
        """
        annotation = Annotation("COG0001")
        self.session.add(annotation)
        self.session.commit()

        assert Annotation.query.first() is annotation

        # Test the many-to-many relationship
        reference_assembly = ReferenceAssembly("version 1")
        gene = Gene("gene1", reference_assembly)
        gene2 = Gene("gene2", reference_assembly)
        gene3 = Gene("gene3", reference_assembly)

        annotation2 = Annotation("COG0002", description="This cog is really really good")
        # Test having multiple genes to one annotation
        annotation_source = AnnotationSource("Cog", "v1.0", "rpsblast", "e_value=0.000001")
        gene_annotation1 = GeneAnnotation(annotation_source = annotation_source, e_value=0.0000001)
        gene_annotation2 = GeneAnnotation(annotation_source = annotation_source)

        gene_annotation1.gene = gene
        gene_annotation2.gene = gene2

        gene_annotation1.annotation = annotation
        gene_annotation2.annotation = annotation

        # gene3 is added without any annotation; it serves as the negative case below.
        self.session.add(annotation)
        self.session.add(gene3)
        self.session.add(gene_annotation1)
        self.session.add(gene_annotation2)
        self.session.commit()

        annotation_01 = Annotation.query.filter_by(type_identifier="COG0001").first()
        assert len(annotation_01.genes) == 2
        assert gene in annotation_01.genes
        assert gene2 in annotation_01.genes
        assert annotation in Gene.query.filter_by(name="gene1").first().annotations
        assert annotation in Gene.query.filter_by(name="gene2").first().annotations
        assert len(Gene.query.filter_by(name="gene3").first().annotations) == 0

        # Genes for annotation method: the result is checked for (gene, annotation) pairs
        genes_for_annotation = Annotation.genes_per_annotation([annotation.id])
        assert len(genes_for_annotation) == 2
        assert (gene, annotation) in genes_for_annotation
        assert (gene2, annotation)  in genes_for_annotation

        # Add the second annotation and find it through a substring match on its description
        self.session.add(annotation2)
        self.session.commit()
        q =  Annotation.query.filter(Annotation.description.contains("good"))
        annotation_02 = q.all()
        assert len(annotation_02) == 1
        assert annotation_02[0] == annotation2

        # Test having multiple annotations to one gene
        gene_annotation3 = GeneAnnotation(annotation2, gene, annotation_source, e_value = 1e-14)
        self.session.add(gene_annotation3)
        self.session.commit()

        assert len(Gene.query.filter_by(name="gene1").first().annotations) == 2
        assert annotation in Gene.query.filter_by(name="gene1").first().annotations
        assert annotation2 in Gene.query.filter_by(name="gene1").first().annotations

        # 1e-7 (gene_annotation1) is larger than 1e-14 (gene_annotation3)
        assert gene_annotation1.e_value > gene_annotation3.e_value
        assert gene.e_value_for(annotation) > gene.e_value_for(annotation2)

        # Links that exist at this point:
        # gene -> annotation
        # gene2 -> annotation
        # gene -> annotation2

        # Genes for annotation method
        genes_for_annotation = Annotation.genes_per_annotation([annotation.id])
        assert len(genes_for_annotation) == 2
        assert (gene, annotation) in genes_for_annotation
        assert (gene2, annotation) in genes_for_annotation

        genes_for_annotation = Annotation.genes_per_annotation([annotation2.id])
        assert len(genes_for_annotation) == 1
        assert (gene, annotation2) in genes_for_annotation

        genes_for_annotation = Annotation.genes_per_annotation([annotation.id, annotation2.id])
        assert len(genes_for_annotation) == 3
        assert (gene, annotation) in genes_for_annotation
        assert (gene, annotation2) in genes_for_annotation
        assert (gene2, annotation) in genes_for_annotation

        # annotation3 is never added to the session; only its description properties are checked.
        annotation3 = Annotation("COG0003", description=("This cog is really really good. I assure you, "
            "really quite good. Among its capabilities I have to mention that its utterly suitable for "
            "testing the description string, including the short description."))

        # short_description is expected to be the first 100 characters followed by "..."
        assert len(annotation3.description) > 103
        assert annotation3.short_description[-3:] == "..."
        assert len(annotation3.short_description) == 103
        assert annotation3.description[:100] == annotation3.short_description[:100]
    def test_taxon(self):
        """Exercise Taxon: its link to Gene, per-sample rpkm and rpkm_table.

        Genes, samples and gene counts are added step by step; after most
        commits refresh_all_mat_views() is called before the taxon statistics
        are asserted.
        """
        ref_assembly = ReferenceAssembly("Version 1")
        gene1 = Gene("gene1", ref_assembly)

        sample1 = Sample("P1993_101", None, None)
        # NOTE(review): reference_assembly is not used again in this test.
        reference_assembly = ReferenceAssembly("version 1")
        gene_count1 = GeneCount(gene1, sample1, 0.001)
        taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
        gene1.taxon = taxon1
        self.session.add(gene1)
        self.session.add(taxon1)
        self.session.add(sample1)
        self.session.add(gene_count1)
        self.session.commit()

        # Re-read both objects through queries before checking the relationship
        gene1 = Gene.query.first()
        taxon1 = Taxon.query.first()

        assert gene1.taxon == taxon1
        assert gene1 in taxon1.genes
        assert taxon1.superkingdom == 'Bacteria'
        assert taxon1.phylum == 'Proteobacteria'
        # Levels that were not given are expected to be empty strings
        assert taxon1.taxclass == ''
        assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
        refresh_all_mat_views()

        # Test sample count retrieval: a sample without gene counts must not appear in rpkm
        sample2 = Sample("P1993_102", None, None)
        self.session.add(sample2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001}

        gene_count2 = GeneCount(gene1, sample2, 0.2)
        self.session.add(gene_count2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        gene2 = Gene("gene2", ref_assembly)
        gene_count3 = GeneCount(gene2, sample2, 0.1)

        self.session.add(gene2)
        self.session.add(gene_count3)
        self.session.commit()
        refresh_all_mat_views()

        # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
        gene2.taxon = taxon2
        self.session.add(taxon2)
        self.session.add(gene2)
        self.session.commit()
        refresh_all_mat_views()

        # Taxon2 should have gene_count3 stats only
        assert taxon2.rpkm == {sample2: 0.1}

        # gene3 is attached to taxon1 through taxon_id instead of the relationship attribute
        gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
        gene_count4 = GeneCount(gene3, sample1, 1.0)

        self.session.add(gene3)
        self.session.add(gene_count4)
        self.session.commit()
        # NOTE(review): unlike the other steps, refresh_all_mat_views() is not
        # called between this commit and the assertion below - confirm intended.

        # Taxon1 should now have the original stats plus gene_count4
        assert taxon1.rpkm == {sample1: 1.001, sample2: 0.2}


        taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
        self.session.add(taxon3)
        self.session.commit()
        gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
        gene_count5 = GeneCount(gene4, sample2, 0.003)

        self.session.add(gene4)
        self.session.add(gene_count5)
        self.session.commit()
        refresh_all_mat_views()

        # theoretical rpkm_table:
        # samples = [sample1, sample2]
        # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
        samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
        assert samples == [sample1, sample2]
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Bacteria", "Eukaryota"] # Sorted by summed rpkm
        assert rpkm_table[("Bacteria")] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table[("Eukaryota")] == {sample2: 0.103}

        # At phylum level the table keys are the taxonomy joined with ';' up to that level
        samples, rpkm_table, complete_val_to_val= Taxon.rpkm_table(level='phylum')
        assert samples == [sample1, sample2]
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Proteobacteria", "Chlorophyta", "Unnamed"] # Sorted by summed rpkm

        assert rpkm_table[("Bacteria;Proteobacteria")] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table[("Eukaryota;Chlorophyta")] == {sample2: 0.1}
        assert rpkm_table[("Eukaryota;Unnamed")] == {sample2: 0.003}
Example #6
0
    def test_autocomplete_all(self):
        """Exercise the /search/autocomplete_all/ endpoint for every entry type.

        Sections, in order: gene and amino-acid mutation queries, genomic
        (nucleotide) mutation queries, pathways, diseases, and
        "disease in gene" queries. Each section first adds the database
        objects it needs and then checks the returned entries.
        """

        # MC3 GeneList is required as a target (a href for links) where users will be pointed
        # after clicking of cancer autocomplete suggestion. Likewise with the ClinVar list.
        db.session.add_all([
            GeneList(name=name, mutation_source_name=detail_class.name)
            for name, detail_class in [
                ('TCGA', MC3Mutation), ('ClinVar', InheritedMutation)
            ]
        ])

        g = Gene(name='BR')
        p = Protein(id=1, refseq='NM_007', gene=g, sequence='XXXXXV')
        g.preferred_isoform = p     # required for gene search to work - genes without preferred isoforms are ignored
        mut = Mutation(protein=p, position=6, alt='E')
        db.session.add_all([mut, p, g])

        def autocomplete(query):
            # Query the endpoint and also follow the URLs it returned.
            r = self.client.get('/search/autocomplete_all/?q=' + query)
            self.visit_returned_urls(r)
            return r

        from database import bdb_refseq, bdb
        bdb_refseq['BR V6E'] = [p.id]  # required for mutation search
        bdb.add_genomic_mut('1', 10000, 'T', 'C', mut)

        # Gene and mutations

        response = autocomplete('BR V6E')
        entry = get_entry_and_check_type(response, 'aminoacid mutation')
        assert entry

        # incomplete mutation queries should produce a prompt for the missing parts
        response = autocomplete('BR V6')
        entry = get_entry_and_check_type(response, 'message')
        assert 'Awaiting for <code>{alt}</code>' in entry['name']

        response = autocomplete('BR V')
        entry = get_entry_and_check_type(response, 'message')
        assert 'Awaiting for <code>{pos}{alt}</code>' in entry['name']

        response = autocomplete('B')
        entry = get_entry_and_check_type(response, 'gene')
        assert 'BR' == entry['name']

        # genomic mutation
        response = autocomplete('chr1 10000 T C')
        entry = get_entry_and_check_type(response, 'nucleotide mutation')
        assert entry and entry['input'] == 'CHR1 10000 T C'

        # is the search falling back to the other strand?
        response = autocomplete('chr1 10000 A G')
        entry = get_entry_and_check_type(response, 'nucleotide mutation')
        assert entry and entry['input'] == 'complement of CHR1 10000 A G'

        prompt = 'Awaiting for mutation in <code>{chrom} {pos} {ref} {alt}</code> format'

        for prompt_invoking_query in ['chr1', 'chr1 ', 'chr1 40', 'chr1 40 ', 'chr1 40 T']:
            response = autocomplete(prompt_invoking_query)
            entry = get_entry_and_check_type(response, 'message')
            assert entry['name'] == prompt

        # Pathways

        pathways = [
            Pathway(description='Activation of RAS in B cells', reactome=1169092),
            Pathway(description='abortive mitotic cell cycle', gene_ontology=33277),
            Pathway(description='amacrine cell differentiation', gene_ontology=35881),
            Pathway(description='amniotic stem cell differentiation', gene_ontology=97086)
        ]

        db.session.add_all(pathways)

        # test partial matching and Reactome id pathways search
        for ras_activation_query in ['Activation', 'REAC:1', 'REAC:1169092']:
            response = autocomplete(ras_activation_query)
            entry = get_entry_and_check_type(response, 'pathway')
            assert entry['name'].startswith('Activation of RAS in B cells')

        # test Gene Ontology search:
        response = autocomplete('GO:33')
        go_pathway = get_entry_and_check_type(response, 'pathway')
        assert go_pathway['name'] == 'abortive mitotic cell cycle (GO:33277)'

        # check if multiple pathways are returned
        response = autocomplete('differentiation')
        assert len(response.json['entries']) == 2

        # check if both genes and pathways are returned simultaneously
        # there should be: a pathway ('a>b<ortive...') and the >B<R gene
        response = autocomplete('b')
        entries = response.json['entries']
        names = [entry['name'] for entry in entries]
        assert all(name in names for name in ['BR', 'abortive mitotic cell cycle'])

        # check if "search more pathways" is displayed
        response = autocomplete('cell')    # cell occurs in all four of added pathways;
        # as a limit of pathways shown is 3, we should get a "show more" link
        links = entries_with_type(response, 'see_more')
        assert len(links) == 1
        assert links[0]['name'] == 'Show all pathways matching <i>cell</i>'

        # test case insensitive text search
        response = autocomplete('AMNIOTIC STEM')
        pathways = entries_with_type(response, 'pathway')
        assert len(pathways) == 1
        assert pathways[0]['name'] == 'amniotic stem cell differentiation'

        # Disease
        disease_names = [
            'Cystic fibrosis', 'Polycystic kidney disease 2',
            'Frontotemporal dementia', 'Cataract, nuclear total'
        ]
        diseases = {name: Disease(name=name) for name in disease_names}
        db.session.add_all(diseases.values())

        response = autocomplete('cystic')
        cystic_matching = entries_with_type(response, 'disease')
        # both 'Cystic fibrosis' and PKD2 should match
        assert len(cystic_matching) == 2

        # is comma containing disease name properly linked?
        response = autocomplete('Cataract')
        cataract = get_entry_and_check_type(response, 'disease')
        assert cataract['name'] == 'Cataract, nuclear total'

        # Gene mutation in disease

        # test suggestions
        response = autocomplete('cystic ')
        entry = entries_with_type(response, 'message')[0]
        # raw string: "\?" is not a valid escape sequence in a normal string literal
        assert re.match(r'Do you wish to search for (.*?) mutations\?', entry['name'])

        # currently there are no mutations associated with any disease
        # so the auto-completion should not return any results
        response = autocomplete('cystic in ')
        assert not response.json['entries']

        # let's add a mutation
        m = Mutation(protein=p, position=1, alt='Y')
        bdb_refseq['BR X1Y'] = ['NM_007']
        # note: sig_code is required here
        data = ClinicalData(disease=diseases['Cystic fibrosis'], sig_code=1)
        disease_mutation = InheritedMutation(mutation=m, clin_data=[data])
        db.session.add_all([m, data, disease_mutation])

        # should return '.. in BR' suggestion now.
        for query in ['cystic in', 'cystic in ']:
            response = autocomplete(query)
            result = get_entry_and_check_type(response, 'disease_in_protein')
            assert result['gene'] == 'BR'
            assert result['name'] == 'Cystic fibrosis'

        # both gene search and refseq search should yield the same, non-empty results
        results = []

        for query in ['cystic in BR', 'cystic in NM_007', 'cystic in 007']:
            response = autocomplete(query)
            result = get_entry_and_check_type(response, 'disease_in_protein')
            results.append(result)

        # `result` still holds the entry of the last query; every entry must equal it
        assert all(r == result for r in results) and result
Example #7
0
def create_test_protein():
    """Build a Protein with refseq 'NM_0007' and sequence 'TRAN' on a new 'Gene X'."""
    return Protein(
        refseq='NM_0007',
        gene=Gene(name='Gene X'),
        sequence='TRAN'
    )
Example #8
0
    def execute_gene(self, feature_rows, strain_id):
        """Queue Gene, Transcript and Feature objects for the given annotation rows.

        Args:
            feature_rows: rows of one gene from the gff file. Each row is
                indexed as: 0 chromosome id, 2 feature type, 3 start, 4 end,
                6 strand ("+" means forward), 8 attribute string.
            strain_id: stored on every Feature created here.

        Rows carrying an "ID=Gene" attribute select the current gene; only
        rows of type "gene" are accepted, any other type resets the current
        gene so that following rows are skipped. When a filter list is set
        and the gene is not in it, processing stops for all remaining rows.
        Rows with "ID=Transcript" queue a Transcript, rows with
        "Parent=Transcript" queue a Feature; objects already seen are not
        queued again.
        """
        gene_id = None

        for feature_row in feature_rows:  # all rows relate to the current gene
            # keep track of start and end
            start = feature_row[3]
            end = feature_row[4]
            direction = "forward" if feature_row[6] == "+" else "reverse"
            chromosome_id = feature_row[0]

            feature_type = feature_row[2]
            attribs = feature_row[8].strip()

            new_gene_id = self.find_attribs_value("ID=Gene", attribs)
            if new_gene_id is not None:

                # only deal with proper genes. setting gene_id to None means nothing else
                # will be processed, so it will essentially skip non-"gene" entries.
                if feature_type != "gene":
                    gene_id = None
                    continue

                # filter list exists and the gene is not in it: skip this gene entirely
                if self.filter_genes is not None and new_gene_id not in self.filter_genes:
                    return

                gene_id = new_gene_id

                # add the Gene entry - if it hasn't been already
                if gene_id not in self.genes_seen:
                    gene = Gene(gene_id)
                    self.genes_to_write.append(gene)
                    self.genes_seen[gene_id] = gene
                continue

            if gene_id is None:
                # no legit gene is active, so transcript rows are ignored
                continue

            transcript_id = self.find_attribs_value("ID=Transcript", attribs)
            if transcript_id is not None:  # it's a transcript entry

                # add the Transcript entry - if it hasn't been already
                transcript_id = self.ensure_unique_transcript_id(transcript_id)

                if transcript_id not in self.transcripts_seen:
                    transcript = Transcript(
                        id=transcript_id, gene_id=gene_id
                    )
                    self.transcripts_to_write.append(transcript)
                    self.transcripts_seen[transcript.id] = transcript
                continue

            # Handle transcript feature entries.
            # for some reason, features for a given strain/transcript
            # combination are not always added
            transcript_id = self.find_attribs_value("Parent=Transcript", attribs)
            if transcript_id is None:
                # this happens for pseudogenes and TEs - which we are not interested in
                continue

            self.features_to_write.append(Feature(
                transcript_id=transcript_id,
                type_id=feature_type,
                strain_id=strain_id,
                chromosome_id=chromosome_id,
                start=start,
                end=end,
                direction=direction
            ))
Example #9
0
def test_gene_set():
    """A GeneSet built from names reports membership and a readable str()."""
    members = ['MDM2', 'TP53']
    named_set = GeneSet('set', members)
    assert Gene('TP53') in named_set
    assert str(named_set) == '<GeneSet: set with 2 genes>'
Example #10
0
def iterate_gtex_vs_spidex(strict=False,
                           tissues=GTEX_TISSUES,
                           location=None,
                           filters=None,
                           no_further_than_x_from_gene=None):
    """
    Yield matching GTEx-SPIDEX pairs (same position, reference and
    alternative alleles) as tuples of
    (variant, tissue, slope, spidex_record, gene).

    Args:
        strict: passed on to choose_record (should critical data
            discrepancies raise errors or be collected as statistics?);
            when true, the collected counters are printed at the end
        tissues: list of tissues to be used
        location: 'intronic' or 'exonic' - filters SPIDEX records
        filters: dict of attribute name -> required value;
            filtering criteria applied to the records fetched for a variant
        no_further_than_x_from_gene: maximal accepted distance of a mutation
            from its gene; if 0, accept only mutations acting on their gene

    SPIDEX contains only single nucleotide substitutions, so any mutation
    with a longer reference or alternative allele is skipped.

    Definitions for GTEx (from http://www.gtexportal.org/home/documentationPage):
        The effect size of the eQTLs is defined as the slope of the linear regression,
        and is computed as the effect of the alternative allele (ALT) relative to the
        reference allele (REF) in the human genome reference GRCh37/hg19
        (i.e., the eQTL effect allele is the ALT allele).

    Definitions for SPIDEX (more in spidex/README):
        dpsi_max_tissue: The delta PSI. This is the predicted change in
                         percent-inclusion due to the variant, reported
                         as the maximum across tissues (in percent).
        dpsi_zscore: This is the z-score of dpsi_max_tissue relative to the
                     distribution of dPSI that are due to common SNP.

        ref_allele: The reference allele at the variant position (forward-strand)
        mut_allele: The mutant allele at the variant position (forward-strand)

    """

    # Hint for manual testing: 'Brain_Cortex' is a very small tissue set,
    # 'Adipose_Subcutaneous' is a larger one.

    expressed_genes = ExpressedGenes(create_path_for_genes_db(tissues))
    expressed_genes.reset()
    import_expressed_genes(expressed_genes, tissues=tissues)

    spidex = tabix.open(SPIDEX_LOCATION)
    total = count_all(tissues)
    stats = Counter()

    def satisfies_filters(candidate):
        # True when every requested attribute has exactly the requested value.
        return all(
            getattr(candidate, key) == value
            for key, value in filters.items()
        )

    expression_rows = tqdm(iterate_over_expression(tissues), total=total)

    for mutation_code, tissue, slope, ensembl_gene_id in expression_rows:
        chrom, pos, ref, alt, _ = mutation_code.split('_')

        # Anything other than a single-nucleotide substitution cannot be in SPIDEX
        if not (len(ref) == 1 and len(alt) == 1):
            stats['not_single'] += 1
            continue

        pos = int(pos)

        gene = Gene(*expressed_genes[ensembl_gene_id])

        if not gene:
            print('gene %s not present in data' % ensembl_gene_id)
            continue

        if no_further_than_x_from_gene is not None:
            margin = no_further_than_x_from_gene
            if pos < gene.start - margin or pos > gene.end + margin:
                stats['not_within_requested_gene_span'] += 1
                continue

        strand = gene.strand
        variant = SingleAltVariant(
            chr_name=chrom,
            chr_start=pos,
            chr_end=pos,
            chr_strand=strand,
            snp_id='-',
            ref=convert_to_strand(ref, strand),
            alt=convert_to_strand(alt, strand),
            gene=ensembl_gene_id
        )

        records = spidex_get_variant(spidex, variant)
        if filters:
            records = list(filter(satisfies_filters, records))

        record = None

        try:
            # if genes are the same there is no need to test strands, but it's better to double check
            record = choose_record(
                records,
                variant,
                variant.alt,
                strict=strict,
                test_strand=True,
                location=location
            )
        except StrandMismatch:
            stats['strand_mismatch'] += 1
        except Intronic:
            stats['intronic'] += 1
        except ToManyRecords:
            stats['to_many_records'] += 1

        # A failed choice is additionally counted as "not found"
        if not record:
            stats['Not found in SPIDEX'] += 1
            continue

        if gene.name != record.gene:
            stats['gene_name_mismatch'] += 1
            continue

        variant.refseq_transcript = record.transcript

        yield variant, tissue, slope, record, gene

    if strict:
        print(stats)
Example #11
0
    def test_protein_references(self):
        """Check load_external_references against three temporary input files.

        Proteins for four refseq ids plus a TP53 protein are added to the
        session, the references are loaded, and the resulting UniProt,
        Ensembl, RefSeq and Entrez data are asserted per protein.
        """

        # The UniProt and reflink inputs are written through gzip; the refseq input is plain.
        uniprot_filename = make_named_temp_file(data=idmapping_dat,
                                                opener=gzip.open,
                                                mode='wt')
        reflink_filename = make_named_temp_file(data=reflink_data,
                                                opener=gzip.open,
                                                mode='wt',
                                                suffix='.gz')
        refseq_filename = make_named_temp_file(data=refseq_data)

        refseqs = [
            'NM_011739',  # present in reference mappings
            'NM_001131572',  # present
            'NM_201200',  # present
            'NM_0001'  # not present in reference mappings
        ]

        # All four proteins share a single gene
        g = Gene(name='Some gene')
        proteins_we_have = {
            refseq_nm: Protein(refseq=refseq_nm, gene=g)
            for refseq_nm in refseqs
        }

        tp53 = Gene(name='TP53')
        tp53_protein = Protein(refseq='NM_000546', gene=tp53)

        with self.app.app_context():
            # let's pretend that we already have some proteins in our db
            db.session.add_all(proteins_we_have.values())
            db.session.add(tp53_protein)

            references = load_external_references(uniprot_filename,
                                                  refseq_filename,
                                                  reflink_filename)

            # there are 3 references we would like to have extracted
            assert len(references) == 3

            protein = proteins_we_have['NM_011739']

            # Two UniProt entries are expected: index 1 reviewed, index 0 not reviewed
            assert len(protein.external_references.uniprot_entries) == 2
            uniprot_entry = protein.external_references.uniprot_entries[1]
            assert uniprot_entry.accession == 'P68254'
            assert uniprot_entry.isoform == 1
            assert uniprot_entry.reviewed is True

            uniprot_entry = protein.external_references.uniprot_entries[0]
            assert uniprot_entry.reviewed is False

            ensembl_peptides = protein.external_references.ensembl_peptides

            # Peptide ids are compared as a set, so their order does not matter
            assert len(ensembl_peptides) == 2
            assert (set(ensembl.peptide_id
                        for ensembl in ensembl_peptides) == {
                            'ENSMUSP00000106602', 'ENSMUSP00000100067'
                        })

            protein = proteins_we_have['NM_001131572']

            assert len(protein.external_references.uniprot_entries) == 1
            uniprot_entry = protein.external_references.uniprot_entries[0]
            assert uniprot_entry.accession == 'Q5RFJ2'
            assert uniprot_entry.isoform == 1
            assert uniprot_entry.reviewed is False

            # check if protein without references stays clear
            protein = proteins_we_have['NM_0001']

            # it's needed to re-add the protein cause ORM will emit a query
            # (just in case, that's how flask-testing works - any object needs
            # to be re-added to session after its termination)
            db.session.add(protein)

            assert protein.external_references is None

            # check the protein with refseq references and gene with entrez id
            assert tp53_protein.external_references.refseq_np == 'NP_000537'
            assert tp53.entrez_id == 7157
Example #12
0
def test_gene_init():
    """A Gene built from a bare name exposes that name unchanged."""
    expected_name = 'BAD'
    created = Gene(expected_name)
    assert created.name == expected_name
Example #13
0
    def parser(line):
        """Register the gene and the protein described by a single row.

        Args:
            line: indexable sequence of column values. The refseq id is read
                from index 1, the chromosome from index 2, the strand from
                index 3, the gene name from the fourth position from the end
                and the coordinates from the indices listed in ``columns``.

        Side effects (all on names from the enclosing scope):
            * ``genes`` gains a new Gene, or an existing one is updated,
            * ``proteins`` gains a new Protein keyed by refseq,
            * for a refseq seen before, it is appended to ``with_duplicates``
              and its gene is added to ``potentially_empty_genes``.

        Raises:
            AssertionError: if the strand is not in ``allowed_strands``.
            ValueError: if a coordinate column cannot be converted to int.
        """
        # use name2 (fourth column from the end)
        name = line[-4]

        strand = line[3]
        assert strand in allowed_strands

        gene_data = {
            'name': name,
            'chrom': line[2][3:],  # remove chr prefix
            'strand': strand == '+',
        }

        # genes are looked up case-insensitively
        gene_key = name.lower()

        if gene_key not in genes:
            gene = Gene(**gene_data)
            genes[gene_key] = gene
        else:
            gene = genes[gene_key]
            # the most recent row wins; report every attribute it overrides
            for key, value in gene_data.items():
                previous = getattr(gene, key)
                if previous != value:
                    print('Replacing %s %s with %s (previously: %s)' %
                          (gene, key, value, previous))
                    setattr(gene, key, value)

        # load protein
        refseq = line[1]

        # if protein is already in database no action is required
        if refseq in known_proteins:
            return

        # do not allow duplicates: they are skipped without any message and
        # only recorded (the warning code that used to be disabled here
        # suggested that duplicates on X/Y probably belong to
        # pseudoautosomal regions)
        if refseq in proteins:
            with_duplicates.append(refseq)
            potentially_empty_genes.add(gene)
            return

        # from this line there is no processing of duplicates allowed
        assert refseq not in proteins

        protein_data = {'refseq': refseq, 'gene': gene}

        # pair coordinate names with the values of the selected columns,
        # taken in the order in which they appear in the row
        coordinates = zip(
            coordinates_names,
            [int(value) for i, value in enumerate(line) if i in columns])
        protein_data.update(coordinates)

        proteins[refseq] = Protein(**protein_data)
    def run(self, experiment: Experiment) -> ImpactAnalysisResult:
        """Run the impact analysis on the given experiment.

        Genes whose fold change is NaN are excluded from ``experiment``.
        ``self.experiment_genes``, ``self.FC`` and ``self.degs`` are
        overwritten. When ``self.markdown`` is set, the result is also
        passed to ``generate_markdown``.

        Returns:
            list of pathways sorted by their impact factor. Each pathway in the list has values of FDR and
            Bonferroni corrections assigned.
        """
        self.experiment_genes = {
            gene.name for gene in experiment.get_all().genes
        }

        # calculate fold change
        self.FC = experiment.calculate_fold_change()

        # remove genes for which fold change cannot be calculated correctly
        fold_change = self.FC['FC']
        experiment.exclude_genes(list(fold_change[isnan(fold_change)].index))

        # NOTE(review): self.degs is replaced by a pandas object below, so
        # this truthiness test looks unsafe if run() is called a second
        # time on the same instance - confirm.
        if self.degs:
            # NOTE(review): experiment_genes holds gene names while the
            # membership test uses Gene objects - confirm this filter does
            # what is intended.
            user_degs = {}
            for x in self.degs:
                candidate = Gene(x)
                if candidate not in self.experiment_genes:
                    user_degs[candidate] = True
            self.degs = pd.Series(user_degs)
        else:
            # select differentially expressed genes
            significant = ttest(experiment) <= self.threshold
            self.degs = significant[significant == True]

        if self.degs.size == 0:
            # if there are no DEGs anywhere, the problem of finding the impact on various pathways is meaningless
            print('No differentialy expressed genes.')
            return ImpactAnalysisResult([])

        kegg = KEGGPathways(self.org)

        # collect every pathway that contains at least one DEG; the first
        # description seen for a pathway code is kept
        pathways = {}
        for deg in self.degs.index:
            for code, description in kegg.search_by_gene(deg.name).items():
                pathways.setdefault(code, description)

        if not pathways:
            print('No pathways found in database.')
            return ImpactAnalysisResult([])

        res = pd.DataFrame(columns=['name', 'IF', 'pvalue'])
        for code, descr in pathways.items():
            pathway = kegg.get_pathway(code)
            impact_factor, pval = self.calculate_impact_factor(
                experiment, pathway)
            # skip pathways for which either value is missing
            if impact_factor is None or pval is None:
                continue
            res.loc[len(res.index)] = [descr, impact_factor, pval]

        corrections = self.calculate_corrections(res['pvalue'])
        res['FDR'], res['Bonferroni'] = corrections

        def impact_or_zero(pathway):
            # NaN impact factors are ordered as if they were zero
            return 0 if isnan(pathway.IF) else pathway.IF

        ifp_pathways = sorted(
            (IAPathway(res.loc[i]) for i in range(len(res.index))),
            key=impact_or_zero,
            reverse=True)

        result = ImpactAnalysisResult(ifp_pathways)
        if self.markdown:
            result.generate_markdown(self.markdown,
                                     'Results of Impact Analysis:')
        return result
Example #15
0
    def test_mutation(self):
        """Query the chromosome mutation view for a novel and a known mutation.

        Covers the plain JSON response, the per-dataset metadata and the
        effect of the `filters` query argument.
        """
        methylation_site = Site(position=13,
                                types={SiteType(name='methylation')})
        protein = Protein(refseq='NM_007',
                          id=1,
                          sites=[methylation_site],
                          sequence='A' * 15,
                          gene=Gene(name='SomeGene'))

        db.session.add(protein)

        from database import bdb

        # amino acid position -> genomic position passed to bdb
        genomic_positions = {13: 14370, 15: 14376}

        muts = {}
        for aa_pos, dna_pos in genomic_positions.items():
            mutation = Mutation(protein=protein, position=aa_pos, alt='V')
            muts[aa_pos] = mutation
            bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mutation,
                                is_ptm=True)

        query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

        # query as a novel mutation
        novel_query = query_url.format(chrom='chr20', pos=14370,
                                       ref='G', alt='A')
        response = self.client.get(novel_query)

        expected_site = {
            'kinases': [],
            'position': 13,
            'residue': 'A',
            'kinase_groups': [],
            'type': 'methylation'
        }
        expected_mutation = {
            'alt': 'V',
            'gene': 'SomeGene',
            'in_datasets': {},
            'pos': 13,
            'ptm_impact': 'direct',
            'cnt_ptm': 1,
            'closest_sites': ['13 A'],
            'protein': 'NM_007',
            'sites': [expected_site],
            'ref': 'A'
        }

        assert response.status_code == 200
        assert response.json == [expected_mutation]

        # well let's look on a known mutation:
        known = muts[15]
        breast_cancer = Cancer(name='Breast invasive carcinoma', code='BRCA')
        mc3 = MC3Mutation(mutation=known, cancer=breast_cancer, count=1)
        esp = ExomeSequencingMutation(mutation=known,
                                      maf_all=0.02,
                                      maf_aa=0.02)

        db.session.add_all([known, mc3, esp])
        db.session.commit()

        mutation_a15v_query = query_url.format(chrom='chr20', pos=14376,
                                               ref='G', alt='A')
        response = self.client.get(mutation_a15v_query)

        metadata = {
            'MC3': {
                'Cancers': [
                    {'Cancer': 'Breast invasive carcinoma', 'Value': 1}
                ]
            },
            'ESP6500': {'MAF': 0.02, 'MAF AA': 0.02, 'MAF EA': None}
        }

        assert response.json[0]['in_datasets'] == metadata

        expected_values = {'MC3': 1, 'ESP6500': 0.02}

        # if user does not want to download data for all datasets he may use:
        for source, meta in metadata.items():
            source_only_query = (mutation_a15v_query +
                                 '?filters=Mutation.sources:in:' + source)
            record = self.client.get(source_only_query).json[0]
            assert record['in_datasets'] == {source: meta}
            assert record['value'] == expected_values[source]

        # (filters, whether the mutation is expected in the response)
        filtered_queries = [
            ('Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA',
             True),
            ('Mutation.sources:in:ESP6500;'
             'Mutation.populations_ESP6500:in:African American',
             True),
            ('Mutation.sources:in:ESP6500;'
             'Mutation.populations_ESP6500:in:European American',
             False),
        ]
        for filters, expect_match in filtered_queries:
            response = self.client.get(
                mutation_a15v_query + '?filters=' + filters)
            assert bool(response.json) is expect_match
# Seed script: recreates the database tables and inserts two sample rows
# each of Strain, Plasmid and Gene.
# NOTE(review): statement order looks significant here (imports are mixed
# with setup calls) - do not reorder without confirming.
from database import db
from app import create_app
app = create_app()
db.init_app(app)
# push an application context so that the db calls below run with `app` active
app.app_context().push()
# NOTE(review): models are imported only after the app context is pushed -
# presumably on purpose; confirm before moving this import to the top.
from models import Strain, Plasmid, Gene

# WARNING: destructive - presumably drops every existing table (and its
# data) before the schema is created again; verify against the db library.
db.drop_all(bind=None)
db.create_all()

# Test strains
strain1 = Strain("E. coli", "Andrew", "2019-10-24", "It's hungry", "yes")
strain2 = Strain("S. aureus", "Andrew", "2019-10-23", "It's cool", "yes")

# Test plasmids
plasmid1 = Plasmid("ACTG", "TCTA", "Andrew", "2019-10-24", "some notes",
                   "file.txt", "1,2")
plasmid2 = Plasmid("TTCA", "GGTA", "Andrew", "2019-10-24", "some more notes",
                   "file2.txt", "3,4")

# Test genes
gene1 = Gene("A happy gene", "CCCA", "Andrew", "2019-10-23", "notes here",
             "file3.txt")
gene2 = Gene("A sad gene", "GTCA", "Andrew", "2019-10-23",
             "even more notes here", "file4.txt")

# stage all sample records and persist them in a single commit
db.session.add_all([strain1, strain2, plasmid1, plasmid2, gene1, gene2])
db.session.commit()
Example #17
0
    def test_mutated_sites(self):
        """Check most_mutated_sites() for single and combined sources.

        Three sites of one protein get mutations from two sources; the
        results are checked per source, for both sources (with and without
        intersection) and with a site type restriction.
        """
        gene = Gene(name='Gene X')
        protein = Protein(refseq='NM_007',
                          sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                          gene=gene)
        gene.preferred_isoform = protein

        glycosylation = SiteType(name='glycosylation')

        # ClinVar muts and TCGA muts but different, with total count = 5 (3 + 2)
        site_a = Site(position=1, residue='A', protein=protein)
        # ClinVar muts intersection TCGA muts, total count = 4 (2 + 2)
        site_k = Site(position=11, residue='K', protein=protein)
        # Only TCGA muts, total count = 3 (1 + 2)
        site_u = Site(position=21, residue='U', protein=protein,
                      types={glycosylation})

        def new_mutation(position):
            return Mutation(position=position, alt='X', protein=protein)

        # this single mutation is shared by both sources at the second site
        shared_mutation = new_mutation(11)

        # the first site (1 A)
        first_site_muts = [
            InheritedMutation(
                mutation=new_mutation(1),
                clin_data=[ClinicalData(), ClinicalData(), ClinicalData()]
            ),
            MC3Mutation(mutation=new_mutation(2), count=2),
        ]
        # the second site (11 K)
        second_site_muts = [
            InheritedMutation(
                mutation=shared_mutation,
                clin_data=[ClinicalData(), ClinicalData()]
            ),
            MC3Mutation(mutation=shared_mutation, count=2),
        ]
        # the third site (21 U)
        third_site_muts = [
            MC3Mutation(mutation=new_mutation(20), count=1),
            MC3Mutation(mutation=new_mutation(22), count=2),
        ]

        db.session.add_all(
            first_site_muts + second_site_muts + third_site_muts)
        db.session.add_all([protein, gene])
        db.session.add_all([site_a, site_k, site_u])
        db.session.commit()

        # ClinVar only: compared as a list, so the order is checked as well
        clinvar_result = most_mutated_sites([InheritedMutation]).all()
        assert clinvar_result == [(site_a, 3), (site_k, 2)]

        # MC3 only: compared as a set, so the order is not checked
        mc3_result = most_mutated_sites([MC3Mutation]).all()
        assert set(mc3_result) == {(site_a, 2), (site_k, 2), (site_u, 3)}

        both_sources = [MC3Mutation, InheritedMutation]

        intersection_result = most_mutated_sites(
            both_sources, intersection=True).all()
        assert intersection_result == [(site_k, 4)]

        no_intersection_result = most_mutated_sites(
            both_sources, intersection=False).all()
        assert no_intersection_result == [(site_a, 5), (site_k, 4)]

        glyco_result = most_mutated_sites(
            [MC3Mutation], site_type=glycosylation).all()
        assert glyco_result == [(site_u, 3)]