def setUp(self):
    """Prepare an isolated clusterfinder config and a populated dummy record.

    The record holds a 2000-base sequence.  For every entry of the fixture
    table below, one CDS feature and one PFAM domain are added, both sharing
    the same location; the PFAM domain carries the given identifier in its
    db_xref list and the given probability (which is None for one entry).
    """
    cli_args = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(cli_args, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))

    # (start, end, probability, pfam identifier)
    domain_table = [
        (10, 20, 0.1, 'FAKE007'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'FAKE234'),
        (500, 505, None, 'FAKE505'),
        (1010, 1020, 0.1, 'FAKE007'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]

    for begin, finish, prob, identifier in domain_table:
        # the CDS and its PFAM domain deliberately share one location object
        shared_location = FeatureLocation(begin, finish)
        cds = CDSFeature(shared_location, locus_tag=str(begin))
        self.record.add_cds_feature(cds)

        domain = PFAMDomain(shared_location, "dummy_description")
        domain.db_xref.append(identifier)
        domain.probability = prob
        self.record.add_pfam_domain(domain)
def test_add_results_to_record(self):
    """Check that Pfam2Go results annotate every PFAM domain of a record.

    A record is built with three PFAM domains plus one extra domain that
    reuses the 'PF00015.2' identifier.  After adding the results to the
    record, each domain's gene ontology ids must match what the results
    report for it, and the extra domain's db_xref must be unchanged.
    """
    locations_by_id = {
        'PF00015.2': FeatureLocation(0, 3),
        'PF00351.1': FeatureLocation(0, 3),
        'PF00015.27': FeatureLocation(3, 6),
    }
    fake_record = set_dummy_with_pfams(locations_by_id)

    duplicate = PFAMDomain(location=FeatureLocation(6, 9), description='DUPLICATE',
                           protein_start=0, protein_end=5)
    duplicate.db_xref = ['PF00015.2']
    duplicate.domain_id = 'DUPLICATE'
    fake_record.add_pfam_domain(duplicate)
    assert duplicate in fake_record.get_pfam_domains()

    gos_by_pfam = pfam2go.get_gos_for_pfams(fake_record)
    results = pfam2go.Pfam2GoResults(fake_record.id, gos_by_pfam)
    results.add_to_record(fake_record)
    assert duplicate.db_xref == ['PF00015.2']

    for domain in fake_record.get_pfam_domains():
        assert sorted(domain.gene_ontologies.ids) == sorted(results.get_all_gos(domain))
        # domains sharing the PF00015 identifier, whatever their version
        # suffix, must carry the same gene ontologies as the duplicate
        for xref in domain.db_xref:
            if not xref.startswith('PF00015'):
                continue
            assert sorted(domain.gene_ontologies.ids) == sorted(results.get_all_gos(duplicate))
def test_bad_pfam_domain(self):
    """Check that PFAMDomain rejects invalid constructor arguments.

    Each case names the expected exception type, the pattern its message
    must match, and the keyword arguments that should trigger it.
    """
    failing_cases = [
        (TypeError, "PFAMDomain description must be a string",
         dict(description=None, protein_start=5, protein_end=10)),
        (TypeError, "Domain must be given domain as a string",
         dict(description="desc", protein_start=5, protein_end=10, domain=5)),
        (ValueError, "A PFAMDomain protein location cannot end before it starts",
         dict(description="desc", protein_start=10, protein_end=5)),
        (ValueError, "invalid literal for int()",
         dict(description="desc", protein_start=10, protein_end="nope")),
    ]
    for error_type, message_pattern, kwargs in failing_cases:
        with self.assertRaisesRegex(error_type, message_pattern):
            PFAMDomain(FeatureLocation(2, 5), **kwargs)
def set_dummy_with_pfams(pfam_ids: Dict[str, FeatureLocation]) -> DummyRecord:
    """Create a DummyRecord whose features are PFAM domains built from a mapping.

    Arguments:
        pfam_ids: a dictionary mapping a PFAM identifier to the location
                  the corresponding domain should have

    Returns:
        a DummyRecord constructed with one PFAMDomain per dictionary entry,
        in the dictionary's iteration order
    """
    def make_domain(identifier, location):
        # every domain gets the same placeholder description and protein range
        domain = PFAMDomain(location=location, description='FAKE',
                            protein_start=0, protein_end=5)
        domain.db_xref = [identifier]
        domain.domain_id = '%s.%d.%d' % (identifier, location.start, location.end)
        return domain

    domains = [make_domain(identifier, location)
               for identifier, location in pfam_ids.items()]
    return DummyRecord(features=domains)
def build_hits(record, hmmscan_results, min_score: float, max_evalue: float, database: str) -> List[Dict[str, Any]]:
    """Build hit dictionaries describing PFAM hits from the given hmmscan results.

    Arguments:
        record: the record providing the CDS name mapping and the sequence
                from which hit translations are extracted
        hmmscan_results: an iterable of results, each with an 'id' and an
                         iterable 'hsps' attribute
        min_score: HSPs with a bitscore less than or equal to this are skipped
        max_evalue: HSPs with an evalue greater than or equal to this are skipped
        database: passed on to pfamdb.get_pfam_id_from_name() when resolving
                  the PFAM identifier for a hit

    Returns:
        a list of dictionaries, one per accepted HSP, with the keys
        "start", "end", "strand", "label", "locus_tag", "domain", "evalue",
        "score", "translation", "db_xref" and "description"
    """
    logging.debug("Generating feature objects for PFAM hits")

    hits = []
    feature_by_id = record.get_cds_name_mapping()

    for result in hmmscan_results:
        for hsp in result.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # Skip HSPs whose query does not correspond to a known CDS.
            # (The previous check compared the query id against itself, which
            # could never trigger and let unknown ids raise a KeyError below.)
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = calculate_start_and_end(feature, hsp)
            strand = feature.location.strand

            # temporary feature, used only to extract the hit's sequence
            dummy_feature = PFAMDomain(FeatureLocation(start, end, strand), description="")
            translation = str(dummy_feature.extract(record.seq).translate(table=feature.transl_table))

            hits.append({
                "start": start,
                "end": end,
                "strand": strand,
                "label": result.id,
                "locus_tag": feature.locus_tag,
                "domain": hsp.hit_id,
                "evalue": hsp.evalue,
                "score": hsp.bitscore,
                "translation": translation,
                "db_xref": [pfamdb.get_pfam_id_from_name(hsp.hit_id, database)],
                "description": hsp.hit_description,
            })

    return hits
def test_blank_records(self):
    """Check gene ontology lookup on records lacking PFAM domains or PFAM ids.

    One record has no PFAM domains at all; the other has a single PFAM
    domain to which no db_xref is assigned here.  Both lookups must give
    an empty result and log the matching debug message.
    """
    record_without_pfams = DummyRecord()

    record_without_ids = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    location = FeatureLocation(0, 12)
    domain = PFAMDomain(location=location, description='MCPsignal',
                        protein_start=0, protein_end=5)
    domain.domain_id = 'BLANK'
    record_without_ids.add_pfam_domain(domain)

    # both calls stay inside the context so that both messages are captured
    with self.assertLogs(level='DEBUG') as log_cm:
        gos_without_pfams = pfam2go.get_gos_for_pfams(record_without_pfams)
        assert 'No Pfam domains found' in str(log_cm.output)
        assert not gos_without_pfams

        gos_without_ids = pfam2go.get_gos_for_pfams(record_without_ids)
        assert 'No Pfam ids found' in str(log_cm.output)
        assert not gos_without_ids
def add_to_record(self, record: Record) -> None:
    """Add a PFAMDomain feature to the record for each stored hit.

    Each domain is built from the hit's location and description, receives
    the remaining hit values as attributes, and is given a domain_id made of
    the tool name, the domain's locus tag and a 1-based, zero-padded counter.

    Arguments:
        record: the record to which the PFAM domains are added
    """
    db_version = pfamdb.get_db_version_from_path(self.database)
    # hit values copied verbatim onto the feature, in this order
    copied_keys = ("label", "locus_tag", "domain", "evalue", "score",
                   "translation", "db_xref")

    for counter, hit in enumerate(self.hits, start=1):
        location = FeatureLocation(hit["start"], hit["end"], hit["strand"])
        domain = PFAMDomain(location, description=hit["description"])

        for key in copied_keys:
            setattr(domain, key, hit[key])

        domain.tool = self.tool
        domain.database = db_version
        domain.detection = "hmmscan"
        domain.domain_id = "{}_{}_{:04d}".format(self.tool, domain.locus_tag, counter)

        record.add_pfam_domain(domain)
def test_pfam_domain(self):
    """Check that a PFAMDomain survives conversion to biopython and back.

    A domain with all of the listed attributes set is converted with
    to_biopython(), rebuilt with from_biopython() from the first resulting
    feature, and then compared attribute by attribute with the source.
    """
    source = PFAMDomain(FeatureLocation(2, 5), description="test",
                        protein_start=5, protein_end=10, domain="p450")
    source.db_xref.append("test-ref")
    source.tool = "toolname"
    source.domain_id = "domain_id"
    source.database = "db"
    source.detection = "someprogram"
    source.evalue = 1e-5
    source.score = 5.
    source.locus_tag = "locus"
    source.label = "somelabel"
    source.translation = "ARNDCQ"
    source.gene_ontologies = GOQualifier({
        'GO:0004871': 'signal transducer activity',
        'GO:0007165': 'signal transduction',
        'GO:0016020': 'membrane',
    })

    bio_features = source.to_biopython()
    rebuilt = PFAMDomain.from_biopython(bio_features[0])

    compared_slots = ("db_xref", "tool", "domain_id", "database", "detection",
                      "evalue", "score", "locus_tag", "label", "translation",
                      "domain", "protein_start", "protein_end")
    for slot in compared_slots:
        assert getattr(source, slot) == getattr(rebuilt, slot)

    assert source.gene_ontologies.go_entries == rebuilt.gene_ontologies.go_entries