def test_add_results_to_record(self):
    """Check that pfam2go results are attached to every PFAM domain of a record.

    The record is built with three domains, two of which share the base
    identifier ``PF00015`` under different versions, and a fourth domain is
    added that reuses the full identifier ``PF00015.2``.  After the results
    are added to the record, each domain's gene ontology ids must match what
    the results report for it.
    """
    locations_by_id = {
        'PF00015.2': FeatureLocation(0, 3),
        'PF00351.1': FeatureLocation(0, 3),
        'PF00015.27': FeatureLocation(3, 6),
    }
    record = set_dummy_with_pfams(locations_by_id)

    # an extra domain whose identifier is already present in the record
    duplicate = PFAMDomain(
        location=FeatureLocation(6, 9),
        description='DUPLICATE',
        protein_start=0,
        protein_end=5,
        identifier="PF00015.2",
        tool="test",
    )
    duplicate.domain_id = 'DUPLICATE'
    record.add_pfam_domain(duplicate)
    assert duplicate in record.get_pfam_domains()

    gos_by_pfam = pfam2go.get_gos_for_pfams(record)
    results = pfam2go.Pfam2GoResults(record.id, gos_by_pfam)
    results.add_to_record(record)
    assert duplicate.full_identifier == 'PF00015.2'

    for domain in record.get_pfam_domains():
        assert sorted(domain.gene_ontologies.ids) == sorted(
            results.get_all_gos(domain))
        if domain.identifier != "PF00015":
            continue
        # make sure identical pfams (with different version numbers)
        # all have the same gene ontologies
        assert domain.version in [2, 27]
        assert sorted(domain.gene_ontologies.ids) == sorted(
            results.get_all_gos(duplicate))
def setUp(self):
    """Build the clusterfinder config and a dummy record populated with features.

    Each entry of the feature table produces one CDS feature and one PFAM
    domain sharing the same location; the domain carries the listed
    probability (``None`` for the entry starting at 500).
    """
    cli_args = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(cli_args, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))

    # (start, end, probability, pfam identifier)
    feature_table = [
        (10, 20, 0.1, 'PF77777'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'PF77776'),
        (500, 505, None, 'PF77775'),
        (1010, 1020, 0.1, 'PF77774'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for start, end, probability, pfam_id in feature_table:
        # the CDS and its PFAM domain deliberately share one location object
        location = FeatureLocation(start, end, strand=1)
        cds = CDSFeature(location, locus_tag=str(start), translation="A")
        self.record.add_cds_feature(cds)

        domain = PFAMDomain(
            location,
            "dummy_description",
            protein_start=start + 1,
            protein_end=end-1,
            identifier=pfam_id,
            tool="test",
        )
        domain.domain_id = "pfam_%d" % start
        domain.probability = probability
        self.record.add_pfam_domain(domain)
def test_pfam_domain(self):
    """Round-trip a fully populated PFAMDomain through its biopython form.

    Every listed attribute, the gene ontology entries and the full
    identifier must be equal on the original and the regenerated domain.
    """
    original = PFAMDomain(
        FeatureLocation(2, 5),
        description="test",
        protein_start=5,
        protein_end=10,
        identifier="PF00002.17",
        domain="p450",
        tool="toolname",
    )
    original.domain_id = "domain_id"
    original.database = "db"
    original.detection = "someprogram"
    original.evalue = 1e-5
    original.score = 5.
    original.locus_tag = "locus"
    original.label = "somelabel"
    original.translation = "ARNDCQ"
    original.gene_ontologies = GOQualifier({
        'GO:0004871': 'signal transducer activity',
        'GO:0007165': 'signal transduction',
        'GO:0016020': 'membrane',
    })

    # only the first biopython feature is used to rebuild the domain
    bio_features = original.to_biopython()
    new = PFAMDomain.from_biopython(bio_features[0])

    compared_slots = [
        "tool", "domain_id", "database", "detection", "evalue", "score",
        "locus_tag", "label", "translation", "domain", "protein_start",
        "protein_end", "identifier", "version",
    ]
    for slot in compared_slots:
        assert getattr(original, slot) == getattr(new, slot)
    assert original.gene_ontologies.go_entries == new.gene_ontologies.go_entries
    assert original.full_identifier == new.full_identifier
def set_dummy_with_pfams(pfam_ids: Dict[str, FeatureLocation]) -> DummyRecord:
    """Create a DummyRecord holding one fake PFAM domain per given identifier.

    Arguments:
        pfam_ids: a mapping of PFAM identifier to the location of its domain

    Returns:
        a DummyRecord whose features are the generated PFAMDomains, in the
        iteration order of the mapping
    """
    def build_domain(identifier: str, location: FeatureLocation) -> PFAMDomain:
        """Construct a single fake domain with an id derived from its location."""
        domain = PFAMDomain(
            location=location,
            description='FAKE',
            protein_start=0,
            protein_end=5,
            identifier=identifier,
            tool="test",
        )
        domain.domain_id = '%s.%d.%d' % (identifier, location.start, location.end)
        return domain

    return DummyRecord(features=[
        build_domain(identifier, location)
        for identifier, location in pfam_ids.items()
    ])
def add_to_record(self, record: Record) -> None:
    """ Adds the hits as PFAMDomains to the given record

        Each hit is read as a mapping. The domain id of every created
        feature combines the tool name, the feature's locus tag and the
        1-based position of the hit, zero-padded to four digits.

        Arguments:
            record: the record to add the PFAMDomain features to

        Returns:
            None
    """
    copied_keys = ["label", "locus_tag", "domain", "evalue", "score", "translation"]
    db_version = pfamdb.get_db_version_from_path(self.database)

    for number, hit in enumerate(self.hits, start=1):
        protein_location = FeatureLocation(hit["protein_start"], hit["protein_end"])
        domain = PFAMDomain(
            location_from_string(hit["location"]),
            description=hit["description"],
            protein_location=protein_location,
            identifier=hit["identifier"],
            tool=self.tool,
            locus_tag=hit["locus_tag"],
        )
        # copy the remaining hit details straight onto the feature
        for key in copied_keys:
            setattr(domain, key, hit[key])

        domain.database = db_version
        domain.detection = "hmmscan"
        domain.domain_id = "{}_{}_{:04d}".format(self.tool, domain.locus_tag, number)
        record.add_pfam_domain(domain)
def test_blank_records(self):
    """Check that no gene ontologies are found for two kinds of blank record.

    One record has no PFAM domains at all; the other has a single PFAM
    domain with the identifier ``PF00000``.  Both are expected to give a
    falsy result from ``pfam2go.get_gos_for_pfams``.
    """
    record_without_pfams = DummyRecord()

    record_without_ids = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    lone_domain = PFAMDomain(
        location=FeatureLocation(0, 12),
        description='MCPsignal',
        protein_start=0,
        protein_end=5,
        identifier="PF00000",
        tool="test",
    )
    lone_domain.domain_id = 'BLANK'
    record_without_ids.add_pfam_domain(lone_domain)

    for blank in (record_without_pfams, record_without_ids):
        assert not pfam2go.get_gos_for_pfams(blank)
def add_to_record(self, record: Record) -> None:
    """ Adds the hits as PFAMDomains to the given record

        Each hit is read through attribute access. The domain id of every
        created feature combines the tool name, the feature's locus tag and
        the 1-based position of the hit, zero-padded to four digits.

        Arguments:
            record: the record to add the PFAMDomain features to

        Returns:
            None
    """
    copied_attributes = ["label", "locus_tag", "domain", "evalue", "score", "translation"]
    db_version = pfamdb.get_db_version_from_path(self.database)

    for number, hit in enumerate(self.hits, start=1):
        protein_location = FeatureLocation(hit.protein_start, hit.protein_end)
        domain = PFAMDomain(
            location_from_string(hit.location),
            description=hit.description,
            protein_location=protein_location,
            identifier=hit.identifier,
            tool=self.tool,
            locus_tag=hit.locus_tag,
        )
        # copy the remaining hit details straight onto the feature
        for name in copied_attributes:
            setattr(domain, name, getattr(hit, name))

        domain.database = db_version
        domain.detection = "hmmscan"
        domain.domain_id = "{}_{}_{:04d}".format(self.tool, domain.locus_tag, number)
        record.add_pfam_domain(domain)