Ejemplo n.º 1
0
 def test_simple_after(self):
     """A CDS added after the query is reported at 10000 for its own
     profile and as -1 for a profile that it does not carry."""
     later_cds = self.create_cds(60000, 63000, profiles=["right10k"])
     self.record.add_cds_feature(later_cds)

     # a profile name that was never registered gives the "no hit" value
     unknown = utils.distance_to_pfam(self.record, self.query,
                                      ["notright10k"])
     assert unknown == -1

     known = utils.distance_to_pfam(self.record, self.query, ["right10k"])
     assert known == 10000
Ejemplo n.º 2
0
 def test_simple_before(self):
     """A CDS added before the query is reported at 20000 for its own
     profile and as -1 for a profile that it does not carry."""
     earlier_cds = self.create_cds(29000, 30000, profiles=["left20k"])
     self.record.add_cds_feature(earlier_cds)

     # a profile name that was never registered gives the "no hit" value
     unknown = utils.distance_to_pfam(self.record, self.query,
                                      ["notleft20k"])
     assert unknown == -1

     known = utils.distance_to_pfam(self.record, self.query, ["left20k"])
     assert known == 20000
Ejemplo n.º 3
0
    def test_edge_overlap_before(self):
        """A CDS ending at 10000 is not found; extending its end to 10001
        makes it a hit at 39999, whichever strand it is on."""
        def edge_distance():
            # a fresh profile list is built for every lookup
            return utils.distance_to_pfam(self.record, self.query,
                                          ["l.edge"])

        cds = self.create_cds(9000, 10000, profiles=["l.edge"])
        self.record.add_cds_feature(cds)
        assert edge_distance() == -1

        # one extra base at the end is enough, on the forward strand first
        # and then on the reverse strand
        for strand in (1, -1):
            cds.location = FeatureLocation(9000, 10001, strand=strand)
            assert edge_distance() == 39999
Ejemplo n.º 4
0
 def test_outside_before(self):
     """A CDS spanning 5000-9999 is not reported for its profile."""
     far_cds = self.create_cds(5000, 9999, profiles=["outside"])
     self.record.add_cds_feature(far_cds)

     result = utils.distance_to_pfam(self.record, self.query, ["outside"])
     assert result == -1
Ejemplo n.º 5
0
 def test_self_hit(self):
     """Looking up the "query_gene_prof" profile gives a distance of 0."""
     own_profile = ["query_gene_prof"]
     result = utils.distance_to_pfam(self.record, self.query, own_profile)
     assert result == 0
Ejemplo n.º 6
0
 def test_empty_record(self):
     """With all CDS features removed and no profiles given, the result
     is -1."""
     # drop every CDS feature the fixture registered on the record
     self.record._cds_features.clear()

     result = utils.distance_to_pfam(self.record, self.query, [])
     assert result == -1
Ejemplo n.º 7
0
 def test_with_no_secmet(self):
     """A CDS created without profiles and given a fresh SecMetQualifier
     is not reported for the "test" profile."""
     plain_cds = self.create_cds(55000, 60000, profiles=[])
     # replace whatever qualifier create_cds attached with a new one
     plain_cds.sec_met = SecMetQualifier()
     self.record.add_cds_feature(plain_cds)

     result = utils.distance_to_pfam(self.record, self.query, ["test"])
     assert result == -1
Ejemplo n.º 8
0
def acquire_rodeo_heuristics(record: secmet.Record, query: secmet.CDSFeature,
                             leader: str, core: str,
                             domains: List[str]) -> Tuple[int, List[float]]:
    """ Calculate heuristic scores for RODEO

        Each heuristic adds a fixed number of entries to the feature list, in
        a fixed order, and most heuristics also adjust the integer score.
        The order of the entries must not change, as they are consumed
        positionally.

        Arguments:
            record: the record instance to analyse
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the domains found within CDS features of the cluster

        Returns:
            a tuple of
                the RODEO score, and
                a list of floats for use in the RODEO SVM
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    def flag(condition: object, points: int = 0) -> bool:
        """ Appends 1 or 0 to the features depending on the truth of the
            condition, adding the given points to the score when it holds
        """
        nonlocal score
        hit = bool(condition)
        if hit:
            score += points
        tabs.append(1 if hit else 0)
        return hit

    def motif_start(pattern: str, sequence: str) -> int:
        """ Start of the first match of the pattern, or 0 without a match """
        found = re.search(pattern, sequence)
        return found.span()[0] if found else 0

    # Leader peptide contains FxLD motif (the match is reused further down
    # for the position feature)
    fxld = re.search('F.LD', leader)
    flag(fxld, 2)
    # Core residue position of the Sx4C, Tx4C, Sx5C and Tx5C motifs
    for pattern in ('S....C', 'T....C', 'S.....C', 'T.....C'):
        tabs.append(motif_start(pattern, core))
    # Precursor is within 500 nt?
    hmmer_profiles = ['LANC_like', 'Lant_dehyd_C']
    distance = utils.distance_to_pfam(record, query, hmmer_profiles)
    # distance_to_pfam reports -1 when nothing was found, that sentinel must
    # not be mistaken for a very short distance
    flag(0 <= distance < 500, 1)
    # Cluster contains LanB dehydratase domain (PF04738)
    flag("Lant_dehyd_C" in domains, 2)
    # Cluster contains Lan C cyclase domain (PF05147)
    flag("LANC_like" in domains, 2)
    # Cluster LACKS LanB dehydratase domain (PF04738)
    flag("Lant_dehyd_C" not in domains, -2)
    # Cluster LACKS Lan C cyclase domain (PF05147)
    flag("LANC_like" not in domains, -2)
    # Cluster contains LanB dehydratase elimination C-terminal domain (PF14028)
    flag("PF14028" in domains, 2)
    # Cluster contains S8 peptidase subtilase (PF00082)
    flag("Peptidase_S8" in domains, 1)
    # Cluster contains C39 peptidase (PF03412)
    flag("Peptidase_C39" in domains, 1)
    # Cluster contains ABC transporter (PF00005)
    flag("PF00005" in domains, 1)
    # Cluster contains YcaO-like protein (PF02624)
    flag("YcaO" in domains, -4)
    # Cluster contains ThiF-like protein (PF00899)
    flag("ThiF" in domains, -4)
    # Cluster contains PF02052 (Gallidermin), feature only, no score change
    # ("mature_ab" matches the profile name used for the precursor check
    # below, it was previously misspelled here)
    flag(set(domains).intersection(
        {"Gallidermin", "mature_a", "mature_b", "mature_ab"}))
    # Cluster contains PF8130, feature only, no score change
    flag("Antimicr18" in domains)
    # Precursor peptide mass < 4000 Da
    precursor_analysis = utils.RobustProteinAnalysis(precursor,
                                                     monoisotopic=True,
                                                     ignore_invalid=True)
    flag(precursor_analysis.molecular_weight() < 4000, -3)
    # Core peptide mass < 2000 Da
    core_analysis = utils.RobustProteinAnalysis(core,
                                                monoisotopic=True,
                                                ignore_invalid=True)
    flag(core_analysis.molecular_weight() < 2000, -3)
    # Precursor peptide pHMMs, one feature per profile set, in this order:
    precursor_profiles = (
        # gallidermin superfamily (cl03420) HMM
        {"TIGR03731", "Gallidermin"},
        # lantibio_gallid (TIGR03731) HMM
        {"TIGR03731"},
        # lanti_SCO0268 superfamily (cl22812) HMM
        {"TIGR04451", "strep_PEQAXS"},
        # LD_lanti_pre (TIGR04363) HMM
        {"LD_lanti_pre"},
        # Antimicrobial18 (cl06940) HMM
        {"Antimicr18"},
        # gallidermin (PF02052) HMM
        {"Gallidermin", "mature_a", "mature_ab", "mature_b"},
        # Antimicrobial18 (PF08130) HMM
        {"Antimicr18"},
    )
    precursor_hit = False
    for profiles in precursor_profiles:
        if flag(cds_has_domains(query, profiles)):
            precursor_hit = True
    # any number of precursor hits is rewarded only once
    if precursor_hit:
        score += 3

    # Unmodified masses of the precursor, leader and core peptides; unlike
    # the mass thresholds above these are computed with ignore_invalid=False
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence,
                                               monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))

    # Length of leader peptide
    tabs.append(len(leader))
    # Length of core peptide
    tabs.append(len(core))
    # Length of precursor peptide
    tabs.append(len(precursor))
    # Ratio of length of leader peptide / length of core peptide
    # (the core must not be empty here)
    tabs.append(len(leader) / len(core))
    # Core peptide ≥ 35 residues
    flag(len(core) >= 35, 1)
    # Core peptide contains CC motif (not in last 3 residues)
    flag('CC' in core[:-3], -3)
    # Leader peptide has > 4 negatively charge motifs
    flag(sum(leader.count(aa) for aa in "DE") > 4, 1)
    # Leader peptide has net negative charge
    charge_dict = {"E": -1, "D": -1, "K": 1, "R": 1}
    flag(sum(charge_dict[aa] for aa in leader if aa in charge_dict) < 0, 1)
    # Leader residue position of FxLD motif
    tabs.append(fxld.span()[0] if fxld else 0)
    # Core peptide contains C-terminal CC (within last 3 residues)
    flag('CC' in core[-3:], 2)
    # Core peptide contains DGCGxTC / SFNS / SxxLC / CTxGC / TPGC / SFNSxC motifs
    motifs = (('DGCG.TC', 2), ('SFNS', 2), ('S..LC', 2), ('CT.GC', 1),
              ('TPGC', 1), ('SFNS.C', 1))
    for motif, motif_score in motifs:
        flag(re.search(motif, core), motif_score)
    # Core peptide contains < 2 or < 3 Cys
    cys_count = core.count("C")
    if cys_count < 2:
        score -= 6
        tabs.extend([1, 1])
    elif cys_count < 3:
        score -= 3
        tabs.extend([1, 0])
    else:
        tabs.extend([0, 0])
    # No Cys/Ser/Thr in core peptide
    for amino, penalty in (("C", -10), ("S", -4), ("T", -4)):
        flag(amino not in core, penalty)
    # Lanthionine regex maximum ring number > 4
    numrings, profile = lanscout(core)
    flag(numrings > 4, 2)
    # Lanthionine regex maximum ring number < 3
    flag(numrings < 3, -2)
    # Lanthionine regex 4-membered ring/5-membered ring/6-membered ring/7-membered ring/8-membered ring
    # the first two profile entries only count when their value is not
    # 0, 1 or 2, the remaining entries count whenever they are non-zero
    ring_scores = [2, 2, 2, 2, 1]
    for position, ringsize in enumerate(profile):
        if position < 2:
            present = ringsize not in [0, 1, 2]
        else:
            present = ringsize != 0
        # the score table is only consulted for rings that are present
        if present:
            score += ring_scores[position]
            tabs.append(1)
        else:
            tabs.append(0)
    return score, tabs
Ejemplo n.º 9
0
def acquire_rodeo_heuristics(
        cluster: secmet.Protocluster, query: secmet.CDSFeature, leader: str,
        core: str, domains: Dict[str,
                                 int]) -> Tuple[int, List[float], List[int]]:
    """Calculate heuristic scores for RODEO

    Each heuristic adds a fixed number of entries to the feature list, in a
    fixed order, and most heuristics also adjust the integer score.

    Arguments:
        cluster: the cluster containing the query, its parent record and CDS
                 children are searched for the rSAM domain
        query: the feature being checked
        leader: the sequence of the peptide leader
        core: the sequence of the peptide core
        domains: the domains found within the cluster, only membership of
                 the names is tested

    Returns:
        a tuple of
            the RODEO score,
            the list of features gathered, and
            the hit_end of every PF04055 hsp with a bitscore above 40
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    def flag(condition: object, points: int = 0) -> bool:
        """Appends 1 or 0 to the features depending on the truth of the
        condition, adding the given points to the score when it holds"""
        nonlocal score
        hit = bool(condition)
        if hit:
            score += points
        tabs.append(1 if hit else 0)
        return hit

    # Calcd. precursor, leader and core peptide masses (Da)
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence,
                                               monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))
    # Distance to the closest rSAM (PF04055), looked up once and reused for
    # all the distance based heuristics
    distance = utils.distance_to_pfam(cluster.parent_record, query,
                                      ['PF04055'])
    tabs.append(distance)
    # distance_to_pfam reports -1 when nothing was found, which counts as
    # far away and never as close by
    rsam_found = distance != -1
    # rSAM within 500 nt?
    flag(rsam_found and distance < 500, 1)
    # rSAM within 150 nt?
    flag(rsam_found and distance < 150, 1)
    # rSAM missing or further than 10000 nt?
    # NOTE(review): this heuristic was previously labelled as 1000 nt while
    # the threshold in the code has been 10000, confirm the intended value
    flag(not rsam_found or distance > 10000, -2)
    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    first_cys = precursor.find("C")
    if first_cys != -1 and 0.25 <= first_cys / len(precursor) <= 0.60:
        score += 2
        tabs.extend([1, 0])
    else:
        # also covers a precursor without any Cys
        score -= 2
        tabs.extend([0, 1])
    # Three or more Cys; Less than 3 Cys
    if precursor.count("C") >= 3:
        score += 4
        tabs.extend([1, 0])
    else:
        score -= 4
        tabs.extend([0, 1])
    # CxC/CxxC/CxxxC/CxxxxxC; # CC/CCC
    motifs = (('C.{5}C', 2), ('C.{3}C', 1), ('C.{2}C', 1), ('C.{1}C', 1),
              ('CC', -2), ('CCC', -2))
    for pattern, points in motifs:
        flag(re.search(pattern, core), points)
    # No Cys in last 1/4th? (penalised when there is one)
    quarter_length = -len(precursor) // 4
    if "C" not in precursor[quarter_length:]:
        score += 1
        tabs.append(1)
    else:
        score -= 1
        tabs.append(0)
    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    flag(precursor[:two_thirds].count("C") == 2
         and precursor[two_thirds:].count("C") == 1, 1)
    # Peptide matches SboA hmm
    flag(cds_has_domains(query, {"Subtilosin_A"}), 3)
    # Peptide matches SkfA hmm
    flag(cds_has_domains(query, {"TIGR04404"}), 3)
    # Peptide matches SCIFF hmm
    flag(cds_has_domains(query, {"TIGR03973"}), 2)
    # cluster has PqqD/RRE (PF05402)
    flag("PF05402" in domains, 1)
    # cluster has SPASM domain (PF13186)
    flag("PF13186" in domains, 1)
    # PF04055 (rSAM) domain start > 80
    runresults = subprocessing.run_hmmsearch(
        path.get_full_path(__file__, "data", "PF04055.hmm"),
        fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitends = []  # type: List[int]
    for runresult in runresults:
        for hsp in runresult.hsps:
            # only hits above the cut-off are considered
            if hsp.bitscore > 40:
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    # max_start only moves away from 0 when a hit passed the cut-off
    flag(max_start > 80, 1)
    # cluster has peptidase, one feature per peptidase domain
    peptidase_domains = [
        "Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16", "Peptidase_S41"
    ]
    no_peptidase = True
    for pepdom in peptidase_domains:
        if flag(pepdom in domains, 1):
            no_peptidase = False
    # cluster has transporter, one feature per transporter domain
    for transpdom in ("PF00005", "PF00664"):
        flag(transpdom in domains, 1)
    # cluster has response regulator (PF00072)
    flag("PF00072" in domains, 1)
    # cluster has major facilitator (PF07690)
    flag("PF07690" in domains, 1)
    # cluster has ATPase (PF13304)
    flag("PF13304" in domains, 1)
    # cluster has Fer4_12 (PF13353)
    flag("PF13353" in domains, 1)
    # cluster has rSAM (PF04055)
    flag("PF04055" in domains or "TIGR03975" in domains, 2)
    # cluster has no recognized peptidase
    flag(no_peptidase, -2)
    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    #
    # The precursor is walked from its C-terminus, stopping at the first Cys
    # that has another Cys within the six residues before it. The portion is
    # the number of residues after that Cys and stays 0 without such a motif.
    # An earlier version stored a non-positive offset here and kept
    # searching, so the in-range branch below could never be taken.
    c_terminal_length = 0
    for residues_after, aa in enumerate(reversed(precursor)):
        position = len(precursor) - 1 - residues_after
        if aa == "C" and "C" in precursor[max(position - 6, 0):position]:
            c_terminal_length = residues_after
            break
    # the precursor must not be empty here
    if 0.35 <= c_terminal_length / len(precursor) <= 0.65:
        score += 3
        tabs.extend([0, 1])
    else:
        score -= 2
        tabs.extend([1, 0])
    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence, the lookahead
    # allows overlapping structures to be counted
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    flag(len(re.compile(cysrex).findall(core)) > 1, 2)
    return score, tabs, hitends
Ejemplo n.º 10
0
def generate_rodeo_svm_csv(
        record: Record, query: CDSFeature, leader: str, core: str,
        previously_gathered_tabs: List[Union[float,
                                             int]], fimo_motifs: List[int],
        fimo_scores: Dict[int, float]) -> List[Union[float, int]]:
    """Generates all the items for a single precursor peptide candidate

    The columns are consumed positionally, so their order must not change.

    Arguments:
        record: the record containing the query
        query: the feature of the candidate precursor
        leader: the sequence of the peptide leader, must not be empty
        core: the sequence of the peptide core
        previously_gathered_tabs: features gathered earlier, inserted
                                  directly after the two leading columns
        fimo_motifs: the numbers of the motifs that were hit
        fimo_scores: the score of each motif that was hit, keyed by number

    Returns:
        the list of column values
    """
    amino_acids = "ARDNCQEGHILKMFPSTWYV"
    # aromatic, negative, positive, charged, aliphatic and hydroxyl residues
    residue_groups = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")
    charge_dict = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}
    precursor = leader + core

    def has_nearby(profile: str) -> int:
        """1 if the profile was found no more than 10000 away, else 0"""
        distance = utils.distance_to_pfam(record, query, [profile])
        # -1 is the "not found" value of distance_to_pfam
        if distance == -1 or distance > 10000:
            return 0
        return 1

    def net_charge(sequence: str) -> int:
        """Sum of the estimated charges of all residues of the sequence"""
        return sum(charge_dict[aa] for aa in sequence if aa in charge_dict)

    def composition(sequence: str) -> List[int]:
        """Count of every amino acid followed by the count of every
        residue group within the sequence"""
        counts = [sequence.count(aa) for aa in amino_acids]
        counts.extend(sum(sequence.count(aa) for aa in group)
                      for group in residue_groups)
        return counts

    # Precursor Index, then classification
    columns = [1, 0]  # type: List[Union[float, int]]
    columns += previously_gathered_tabs
    # Cluster has PF00733? PF05402? PF13471?
    columns += [has_nearby(profile)
                for profile in ('PF00733', 'PF05402', 'PF13471')]
    # Leader has LxxxxxT motif?
    columns.append(
        1 if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader) else 0)
    # Core has adjacent identical aas (doubles)?
    columns.append(
        1 if any(first == second
                 for first, second in zip(core, core[1:])) else 0)
    # Core length (aa)
    columns.append(len(core))
    # Leader length (aa)
    columns.append(len(leader))
    # Precursor length (aa)
    columns.append(len(leader) + len(core))
    # Leader/core ratio
    # NOTE(review): the value is the core length divided by the leader
    # length, the inverse of what the label suggests; kept as it was since
    # the column feeds the SVM, confirm against the training data
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core?
    columns.append(core[:9].count("P"))
    # Estimated core, leader and precursor charge
    charges = [net_charge(core), net_charge(leader), net_charge(precursor)]
    columns += charges
    # Absolute value of core, leader and precursor charge
    columns += [abs(charge) for charge in charges]
    # Counts of AAs in leader, followed by the residue group counts
    columns += composition(leader)
    # Counts of AAs in core, followed by the residue group counts
    columns += composition(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence
    # (the slice keeps an empty core from raising, giving all zeros)
    columns += [core[:1].count(aa) for aa in amino_acids]
    # Counts of AAs in leader+core, followed by the residue group counts
    columns += composition(precursor)
    # Motifs
    columns += [1 if motif in fimo_motifs else 0 for motif in range(1, 17)]
    # Total motifs hit
    columns.append(len(fimo_motifs))
    # Motif scores
    motif_scores = [
        fimo_scores[motif] if motif in fimo_motifs else 0
        for motif in range(1, 17)
    ]
    columns += motif_scores
    # Sum of MEME scores
    columns.append(sum(motif_scores))
    # No Motifs?
    columns.append(0 if fimo_motifs else 1)
    # Alternate Start Codon?
    columns.append(
        0 if str(query.extract(record.seq)).startswith("ATG") else 1)
    return columns
Ejemplo n.º 11
0
def acquire_rodeo_heuristics(record: Record, cluster: Cluster,
                             query: CDSFeature, leader: str,
                             core: str) -> Tuple[int, List[Union[float, int]]]:
    """Calculate heuristic scores for RODEO

    Each heuristic up to and including the leader Cys check adds one entry
    to the feature list, in a fixed order, and most also adjust the integer
    score. The final two checks (missing PF13471 and alternate start codon)
    only adjust the score.

    Arguments:
        record: the record containing the query
        cluster: the cluster containing the query, used for the strand check
        query: the feature being checked
        leader: the sequence of the peptide leader
        core: the sequence of the peptide core, must not be empty

    Returns:
        a tuple of
            the RODEO score, and
            the list of features gathered
    """
    tabs = []  # type: List[Union[float, int]]
    score = 0

    def flag(condition: object, points: int = 0) -> bool:
        """Appends 1 or 0 to the features depending on the truth of the
        condition, adding the given points to the score when it holds"""
        nonlocal score
        hit = bool(condition)
        if hit:
            score += points
        tabs.append(1 if hit else 0)
        return hit

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core,
                                                monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    hmmer_profiles = ['PF13471', 'PF00733', 'PF05402']
    distance = utils.distance_to_pfam(record, query, hmmer_profiles)
    tabs.append(distance)
    # distance_to_pfam reports -1 when nothing was found, which counts as
    # far away and never as close by
    found = distance != -1
    # Within 500 nucleotides of any biosynthetic protein (E, B, C)	+1
    flag(found and distance < 500, 1)
    # Within 150 nucleotides of any biosynthetic protein (E, B, C)	+1
    flag(found and distance < 150, 1)
    # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C)	-2
    flag(not found or distance > 1000, -2)
    # Core region has 2 or 4 Cys residues	+1
    cys_count = core.count("C")
    flag(cys_count in [2, 4], 1)
    # Leader region is longer than core region	+2
    flag(len(leader) > len(core), 2)
    # Core has 7 (Glu) or 8(Glu/Asp) or 9 (Asp) membered ring possible	+1
    flag('E' in core[6:8] or 'D' in core[7:9], 1)
    # Leader region contains GxxxxxT	+3
    flag(re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader), 3)
    # Core starts with G	+2
    flag(core.startswith("G"), 2)
    # Peptide and lasso cyclase are on same strand	+1
    flag(is_on_same_strand_as(cluster, query, 'PF00733'), 1)
    # Leader/core region length ratio between 0.5 and 2 (inclusive)	+1
    flag(0.5 <= len(leader) / len(core) <= 2, 1)
    # Core starts with Cys and has an even number of Cys	0
    flag(core.startswith("C") and cys_count % 2 == 0, 0)
    # Core contains no Gly	-4
    flag("G" not in core, -4)
    # Core has at least one aromatic residue	+1
    flag(set("FWY") & set(core), 1)
    # Core has at least 2 aromatic residues	+2
    flag(sum(core.count(aa) for aa in "FWY") >= 2, 2)
    # Core has odd number of Cys	-2
    flag(cys_count % 2 != 0, -2)
    # Leader region contains Trp	-1
    flag("W" in leader, -1)
    # Leader region contains Lys	+1
    flag("K" in leader, 1)
    # Leader region has Cys	-2
    flag("C" in leader, -2)
    # Gene cluster does not contain PF13471	-2 (score only, no feature)
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2
    # Peptide utilizes alternate start codon	-1 (score only, no feature)
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1
    return score, tabs