Ejemplo n.º 1
0
    def setUp(self):
        """Prepare a clusterfinder config and a dummy record for each test.

        The record is 2000 bases long and gets one CDS feature plus one
        PFAM domain per table row below. The second group of rows repeats
        the first eight rows with coordinates shifted by 1000.
        """
        # (start, end, probability, pfam_id); one row has no probability
        domain_table = (
            (10, 20, 0.1, 'FAKE007'),
            (30, 40, 0.3, 'PF00106'),
            (50, 60, 0.4, 'PF00107'),
            (60, 70, 0.7, 'PF00109'),
            (70, 80, 0.98, 'PF08484'),
            (90, 100, 0.8, 'PF02401'),
            (100, 110, 0.32, 'PF04369'),
            (110, 120, 1.0, 'PF00128'),
            (130, 140, 0.2, 'FAKE234'),
            (500, 505, None, 'FAKE505'),
            (1010, 1020, 0.1, 'FAKE007'),
            (1030, 1040, 0.3, 'PF00106'),
            (1050, 1060, 0.4, 'PF00107'),
            (1060, 1070, 0.7, 'PF00109'),
            (1070, 1080, 0.98, 'PF08484'),
            (1090, 1100, 0.8, 'PF02401'),
            (1100, 1110, 0.32, 'PF04369'),
            (1110, 1120, 1.0, 'PF00128'),
        )
        cli_args = [
            "--cf-create-clusters",
            "--cf-mean-threshold", "0.6",
            "--cf-min-cds", "5",
            "--cf-min-pfams", "5",
        ]

        self.config = build_config(cli_args, modules=[clusterfinder],
                                   isolated=True)
        update_config({"enabled_cluster_types": []})

        self.record = DummyRecord(seq=Seq("A" * 2000))
        for begin, finish, prob, identifier in domain_table:
            # the CDS and its PFAM domain share the same location object
            span = FeatureLocation(begin, finish)
            cds = CDSFeature(span, locus_tag=str(begin))
            self.record.add_cds_feature(cds)

            domain = PFAMDomain(span, "dummy_description")
            domain.db_xref.append(identifier)
            domain.probability = prob
            self.record.add_pfam_domain(domain)
Ejemplo n.º 2
0
 def test_add_results_to_record(self):
     """Check that Pfam2Go results are attached to every PFAM domain.

     A domain reusing an existing Pfam ID ('PF00015.2') is added to the
     record; its db_xref must be left unchanged by add_to_record, and
     every 'PF00015' domain must end up with the same gene ontologies.
     """
     record = set_dummy_with_pfams({
         'PF00015.2': FeatureLocation(0, 3),
         'PF00351.1': FeatureLocation(0, 3),
         'PF00015.27': FeatureLocation(3, 6),
     })

     duplicate = PFAMDomain(location=FeatureLocation(6, 9),
                            description='DUPLICATE',
                            protein_start=0,
                            protein_end=5)
     duplicate.db_xref = ['PF00015.2']
     duplicate.domain_id = 'DUPLICATE'
     record.add_pfam_domain(duplicate)
     assert duplicate in record.get_pfam_domains()

     found_gos = pfam2go.get_gos_for_pfams(record)
     results = pfam2go.Pfam2GoResults(record.id, found_gos)
     results.add_to_record(record)
     assert duplicate.db_xref == ['PF00015.2']

     for domain in record.get_pfam_domains():
         expected = sorted(results.get_all_gos(domain))
         assert sorted(domain.gene_ontologies.ids) == expected
         # identical pfams (with different version numbers) must all
         # share the same gene ontologies
         for xref in domain.db_xref:
             if not xref.startswith('PF00015'):
                 continue
             assert sorted(domain.gene_ontologies.ids) == sorted(
                 results.get_all_gos(duplicate))
Ejemplo n.º 3
0
 def test_bad_pfam_domain(self):
     """Check that PFAMDomain rejects invalid constructor arguments.

     Covers a non-string description, a non-string domain, a protein end
     before the protein start, and a protein end that is not an integer.
     """
     with self.assertRaisesRegex(TypeError, "PFAMDomain description must be a string"):
         PFAMDomain(FeatureLocation(2, 5), description=None, protein_start=5, protein_end=10)
     with self.assertRaisesRegex(TypeError, "Domain must be given domain as a string"):
         PFAMDomain(FeatureLocation(2, 5), description="desc", protein_start=5, protein_end=10, domain=5)
     with self.assertRaisesRegex(ValueError, "A PFAMDomain protein location cannot end before it starts"):
         PFAMDomain(FeatureLocation(2, 5), description="desc", protein_start=10, protein_end=5)
     # The expected message is a regular expression: unescaped "()" is an
     # empty group, so the parentheses must be escaped to match literally.
     with self.assertRaisesRegex(ValueError, r"invalid literal for int\(\)"):
         PFAMDomain(FeatureLocation(2, 5), description="desc", protein_start=10, protein_end="nope")
Ejemplo n.º 4
0
def set_dummy_with_pfams(pfam_ids: Dict[str, FeatureLocation]) -> DummyRecord:
    """Build a DummyRecord holding one fake PFAM domain per given Pfam ID.

    Arguments:
        pfam_ids: a mapping of Pfam ID to the location for its domain

    Returns:
        a DummyRecord whose features are the generated PFAMDomains
    """
    def make_domain(identifier, location):
        # every fake domain shares the same description and protein range
        domain = PFAMDomain(location=location,
                            description='FAKE',
                            protein_start=0,
                            protein_end=5)
        domain.db_xref = [identifier]
        # domain ID is the Pfam ID followed by the location start and end
        domain.domain_id = '%s.%d.%d' % (identifier, location.start,
                                         location.end)
        return domain

    domains = [make_domain(identifier, location)
               for identifier, location in pfam_ids.items()]
    return DummyRecord(features=domains)
Ejemplo n.º 5
0
def build_hits(record, hmmscan_results, min_score: float, max_evalue: float,
               database: str) -> List[Dict[str, Any]]:
    """Build hit dictionaries for PFAM domains from the given hmmscan results.

    Arguments:
        record: the record containing the queried CDS features; must provide
                get_cds_name_mapping() and a seq attribute
        hmmscan_results: an iterable of results, each with an id and hsps
        min_score: HSPs with a bitscore at or below this value are skipped
        max_evalue: HSPs with an evalue at or above this value are skipped
        database: passed through to pfamdb.get_pfam_id_from_name() when
                  resolving the Pfam ID of each hit

    Returns:
        a list of dictionaries, one per accepted HSP, with the keys
        start, end, strand, label, locus_tag, domain, evalue, score,
        translation, db_xref and description
    """
    logging.debug("Generating feature objects for PFAM hits")

    hits = []
    feature_by_id = record.get_cds_name_mapping()

    for result in hmmscan_results:
        for hsp in result.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # Skip hits for queries that have no CDS in this record. This
            # must test the CDS mapping; testing the ID against itself never
            # skips anything and lets the lookup below raise a KeyError.
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = calculate_start_and_end(feature, hsp)
            strand = feature.location.strand

            # temporary feature, only used to extract the hit's sequence
            dummy_feature = PFAMDomain(FeatureLocation(start, end, strand),
                                       description="")
            translation = str(dummy_feature.extract(record.seq).translate(
                table=feature.transl_table))

            hits.append({
                "start": start,
                "end": end,
                "strand": strand,
                "label": result.id,
                "locus_tag": feature.locus_tag,
                "domain": hsp.hit_id,
                "evalue": hsp.evalue,
                "score": hsp.bitscore,
                "translation": translation,
                "db_xref": [pfamdb.get_pfam_id_from_name(hsp.hit_id, database)],
                "description": hsp.hit_description,
            })
    return hits
Ejemplo n.º 6
0
 def test_blank_records(self):
     """Check GO lookup on a record with no PFAM domains and on a record
     whose only PFAM domain carries no Pfam ID.

     Both lookups must return an empty result and log the matching
     debug message.
     """
     record_without_pfams = DummyRecord()

     record_without_ids = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
     domain = PFAMDomain(location=FeatureLocation(0, 12),
                         description='MCPsignal',
                         protein_start=0,
                         protein_end=5)
     domain.domain_id = 'BLANK'
     record_without_ids.add_pfam_domain(domain)

     with self.assertLogs(level='DEBUG') as captured:
         no_pfam_gos = pfam2go.get_gos_for_pfams(record_without_pfams)
         assert 'No Pfam domains found' in str(captured.output)
         assert not no_pfam_gos

         no_id_gos = pfam2go.get_gos_for_pfams(record_without_ids)
         assert 'No Pfam ids found' in str(captured.output)
         assert not no_id_gos
Ejemplo n.º 7
0
 def add_to_record(self, record: Record) -> None:
     """Add a PFAMDomain feature to the record for every stored hit.

     Arguments:
         record: the Record to add the generated PFAM domains to
     """
     # hit values that are copied verbatim onto the new feature
     copied_keys = ("label", "locus_tag", "domain", "evalue", "score",
                    "translation", "db_xref")
     db_version = pfamdb.get_db_version_from_path(self.database)

     # domain IDs are numbered from 1 in hit order
     for number, hit in enumerate(self.hits, start=1):
         location = FeatureLocation(hit["start"], hit["end"], hit["strand"])
         domain = PFAMDomain(location, description=hit["description"])
         for key in copied_keys:
             setattr(domain, key, hit[key])
         domain.tool = self.tool
         domain.database = db_version
         domain.detection = "hmmscan"
         domain.domain_id = "{}_{}_{:04d}".format(self.tool,
                                                  domain.locus_tag, number)
         record.add_pfam_domain(domain)
Ejemplo n.º 8
0
 def test_pfam_domain(self):
     """Check that a PFAMDomain survives a round trip through biopython.

     The domain is converted with to_biopython(), rebuilt with
     from_biopython(), and compared attribute by attribute.
     """
     original = PFAMDomain(FeatureLocation(2, 5), description="test",
                           protein_start=5, protein_end=10,
                           domain="p450")
     original.db_xref.append("test-ref")

     assigned = {
         "tool": "toolname",
         "domain_id": "domain_id",
         "database": "db",
         "detection": "someprogram",
         "evalue": 1e-5,
         "score": 5.,
         "locus_tag": "locus",
         "label": "somelabel",
         "translation": "ARNDCQ",
         "gene_ontologies": GOQualifier({
             'GO:0004871': 'signal transducer activity',
             'GO:0007165': 'signal transduction',
             'GO:0016020': 'membrane',
         }),
     }
     for attribute, value in assigned.items():
         setattr(original, attribute, value)

     # only the first generated biopython feature is used for the rebuild
     bio_feature = original.to_biopython()[0]
     rebuilt = PFAMDomain.from_biopython(bio_feature)

     compared = ("db_xref", "tool", "domain_id", "database", "detection",
                 "evalue", "score", "locus_tag", "label", "translation",
                 "domain", "protein_start", "protein_end")
     for attribute in compared:
         assert getattr(original, attribute) == getattr(rebuilt, attribute)
     assert original.gene_ontologies.go_entries == rebuilt.gene_ontologies.go_entries