コード例 #1
0
    def _parse_alignment_line(
            line: str) -> Tuple[str, str, int, str, int, int, Optional[int]]:
        """ Parse one alignment line into its typed fields.

        The stripped line is split with MULTISPACE_REGEX into at most six
        pieces, named in order: type, id, ali_start, sequence, ali_end and
        length. Any leading "(" and trailing ")" are removed from the length
        piece before it is parsed as an integer.

        Returns a tuple of (type, id, ali_start, sequence, ali_end, length,
        seq_begin), where seq_begin is the end offset of the ALI_REGEX match
        against the unstripped line, or None if it does not match.

        Raises LineParseError if the line has no length piece.
        """
        field_names = (
            "type", "id", "ali_start", "sequence", "ali_end", "length")
        pieces = MULTISPACE_REGEX.split(line.strip(), maxsplit=5)
        record = dict(zip(field_names, pieces))

        # Fewer than six pieces means there is no "length" key at all.
        bare_length = fmap(lambda x: x.lstrip("(").rstrip(")"),
                           record.get("length"))
        if bare_length is None:
            raise LineParseError(
                f"Missing 'length' from alignment line: '{line}'.")

        # Matched against the original line, so the offset includes any
        # leading whitespace that was stripped for the split above.
        ali_match = ALI_REGEX.match(line)
        seq_begin: Optional[int] = (
            None if ali_match is None else ali_match.end())

        ali_type = get_and_parse("type", "type", is_one_of(["T", "Q"]))(record)
        ali_id = get_and_parse("id", "id", parse_str)(record)
        ali_start = get_and_parse("ali_start", "ali_start", parse_int)(record)
        sequence = get_and_parse("sequence", "sequence", parse_str)(record)
        ali_end = get_and_parse("ali_end", "ali_end", parse_int)(record)
        length = raise_it(parse_field(parse_int, "length",
                                      "field"))(bare_length)

        return (ali_type, ali_id, ali_start, sequence, ali_end, length,
                seq_begin)
コード例 #2
0
    def _parse_probab_line(
        field: str
    ) -> Tuple[float, float, float, int, float, float, float, Optional[float]]:
        """ Parse the "Probab=... E-value=..." statistics line.

        The stripped line is split with MULTISPACE_REGEX and the pieces are
        matched, in order, against the expected keys. Each piece is first
        parsed with split_at_eq for its key, then converted to its numeric
        type.

        Returns a tuple of (probability, evalue, score, aligned_cols,
        identity, similarity, sum_probs, template_neff). The identity has its
        trailing "%" removed and is divided by 100. template_neff is None
        when the line has no eighth piece.
        """
        keys = (
            "Probab",
            "E-value",
            "Score",
            "Aligned_cols",
            "Identities",
            "Similarity",
            "Sum_probs",
            "Template_Neff",
        )
        pieces = MULTISPACE_REGEX.split(field.strip())

        # zip stops at the shorter input, so a short line simply leaves the
        # trailing keys out of the mapping.
        values = {}
        for piece, key in zip(pieces, keys):
            parse_piece = raise_it(
                parse_field(split_at_eq(parse_str, key), key))
            values[key] = parse_piece(piece)

        # Template_Neff is the only optional key; parse it before the rest.
        template_neff: Optional[float] = None
        if "Template_Neff" in values:
            template_neff = raise_it(
                parse_field(parse_float,
                            "template_neff"))(values["Template_Neff"])

        probability = get_and_parse("Probab", "probability",
                                    parse_float)(values)
        evalue = get_and_parse("E-value", "evalue", parse_float)(values)
        score = get_and_parse("Score", "score", parse_float)(values)
        aligned_cols = get_and_parse("Aligned_cols", "aligned_cols",
                                     parse_int)(values)
        identity = get_and_parse(
            "Identities", "identity",
            lambda x: parse_float(x.rstrip("%")))(values) / 100.0
        similarity = get_and_parse("Similarity", "similarity",
                                   parse_float)(values)
        sum_probs = get_and_parse("Sum_probs", "sum_probs",
                                  parse_float)(values)

        return (
            probability,
            evalue,
            score,
            aligned_cols,
            identity,
            similarity,
            sum_probs,
            template_neff,
        )
コード例 #3
0
    def from_line(cls, line: str) -> "LOCALIZER":
        """ Parse a LOCALIZER line as an object.

        The line is expected to hold four tab-separated columns: the
        sequence name, then the chloroplast, mitochondria and nucleus fields.

        Raises:
            LineParseError: if the line is empty or does not split into
                exactly four columns.
        """

        if line == "":
            raise LineParseError("The line was empty.")

        sline = [c.strip() for c in line.strip().split("\t")]

        if len(sline) != 4:
            raise LineParseError("The line had the wrong number of columns. "
                                 f"Expected 4 but got {len(sline)}")

        def shift_start(pos):
            # Same adjustment for both transit peptide start positions.
            # NOTE(review): presumably converts to a 0-based start and adds
            # back 20 residues trimmed before LOCALIZER was run -- confirm.
            return pos - 1 + 20

        (cp, cp_prob, cp_start,
         cp_end) = parse_tp_field(sline[1], "chloroplast")

        (mt, mt_prob, mt_start,
         mt_end) = parse_tp_field(sline[2], "mitochondria")

        (nuc, nuc_sigs) = parse_nuc_field(sline[3])

        return cls(
            raise_it(parse_field(parse_str, "name"))(sline[0]),
            cp, cp_prob, fmap(shift_start, cp_start), cp_end,
            mt, mt_prob, fmap(shift_start, mt_start), mt_end,
            nuc, nuc_sigs)
コード例 #4
0
    field_name: str = "active_site",
) -> str:
    """ """

    field = field.strip()
    if not field.startswith("predicted_active_site"):
        raise LineParseError(
            f"Invalid value: '{field}' in the column: '{field_name}'. "
            "Must have the form 'predicted_active_site[1,2,3]'.")

    field = field[len("predicted_active_site"):]
    sfield = (f.strip("[],; ") for f in field.split('['))
    return ';'.join(f.replace(' ', '') for f in sfield if len(f) > 0)


# Column parsers for the "ps_" fields. Each is built by handing a value parser
# and the column name to parse_field and wrapping the result in raise_it; the
# wrapped object is meant to be called with the raw column text.
# NOTE(review): raise_it presumably raises the error that parse_field would
# otherwise return -- confirm in predectorutils.parsers.
ps_name = raise_it(parse_field(parse_str, "name"))
ps_ali_start = raise_it(parse_field(parse_int, "ali_start"))
ps_ali_end = raise_it(parse_field(parse_int, "ali_end"))
ps_env_start = raise_it(parse_field(parse_int, "env_start"))
ps_env_end = raise_it(parse_field(parse_int, "env_end"))
ps_hmm = raise_it(parse_field(parse_str, "hmm"))
ps_hmm_name = raise_it(parse_field(parse_str, "hmm_name"))
ps_hmm_type = raise_it(parse_field(parse_str, "hmm_type"))
ps_hmm_start = raise_it(parse_field(parse_int, "hmm_start"))
ps_hmm_end = raise_it(parse_field(parse_int, "hmm_end"))
ps_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))
ps_bitscore = raise_it(parse_field(parse_float, "bitscore"))
ps_evalue = raise_it(parse_field(parse_float, "evalue"))
# NOTE(review): parse_bool("1", "0") presumably maps "1" to True and "0" to
# False -- confirm the argument order.
ps_is_significant = raise_it(
    parse_field(parse_bool("1", "0"), "is_significant"))
コード例 #5
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    is_one_of)

# Only the analysis class is exported; the apo_* parsers stay module-private
# as far as "from ... import *" is concerned.
__all__ = ["ApoplastP"]

# One parser per ApoplastP column (name, prediction, prob). Each pairs a value
# parser with the column name via parse_field and is wrapped in raise_it.
apo_name = raise_it(parse_field(parse_str, "name"))
# The prediction column is checked against these two labels.
apo_prediction = raise_it(
    parse_field(is_one_of(["Apoplastic", "Non-apoplastic"]), "prediction"))
apo_prob = raise_it(parse_field(parse_float, "prob"))


class ApoplastP(Analysis):
    """ One ApoplastP result: a name, a prediction label and a prob. """

    analysis = "apoplastp"
    software = "ApoplastP"

    # Column names and their matching value types, in the same order.
    columns = ["name", "prediction", "prob"]
    types = [str, str, float]

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        """ Store the three column values on the instance unchanged. """
        self.name, self.prediction, self.prob = name, prediction, prob
コード例 #6
0
ファイル: gff.py プロジェクト: ccdmb/predector-utils
# Attribute keys listed in a fixed order; going by the name, this is the order
# in which they are meant to be written out.
# These eleven keys are the reserved (capitalised) attribute tags of the GFF3
# specification.
GFF3_WRITE_ORDER: List[str] = [
    "ID",
    "Name",
    "Alias",
    "Parent",
    "Target",
    "Gap",
    "Derives_from",
    "Note",
    "Dbxref",
    "Ontology_term",
    "Is_circular",
]

# Parsers for the record columns seqid, source, type, start, end, score,
# strand and phase. Each pairs a value parser with the column name via
# parse_field and is wrapped in raise_it.
rec_seqid = raise_it(parse_field(parse_str, "seqid"))
rec_source = raise_it(parse_field(parse_str, "source"))
rec_type = raise_it(parse_field(parse_str, "type"))
rec_start = raise_it(parse_field(parse_int, "start"))
rec_end = raise_it(parse_field(parse_int, "end"))
# NOTE(review): parse_or_none(parse_float, ".") presumably yields None for a
# "." score and a float otherwise -- confirm.
rec_score = raise_it(parse_field(parse_or_none(parse_float, "."), "score"))
# Strand and phase are kept as strings and checked against the listed values.
rec_strand = raise_it(parse_field(is_one_of(["-", "+", ".", "?"]), "strand"))
rec_phase = raise_it(parse_field(is_one_of(["0", "1", "2", "."]), "phase"))


def parse_attr_list(string: str) -> List[str]:
    """ Split a comma-separated string into a list of stripped items.

    Commas and spaces at either end of the whole string are removed before
    splitting, so an empty input yields [""]. Empty items in the middle of
    the string are kept.
    """
    trimmed = string.strip(", ")
    return [item.strip() for item in trimmed.split(",")]


attr_is_circular = raise_it(
    parse_field(
コード例 #7
0
ファイル: deeploc.py プロジェクト: ccdmb/predector-utils
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# One parser per DeepLoc column. Each pairs a value parser with the column
# name via parse_field and is wrapped in raise_it.
dl_name = raise_it(parse_field(parse_str, "name"))
# The prediction column is checked against this fixed list of labels.
dl_prediction = raise_it(
    parse_field(
        is_one_of([
            "Membrane", "Nucleus", "Cytoplasm", "Extracellular",
            "Mitochondrion", "Cell_membrane", "Endoplasmic_reticulum",
            "Plastid", "Golgi_apparatus", "Lysosome/Vacuole", "Peroxisome"
        ]), "prediction"))
# The remaining columns are all parsed as floats, one per label above.
dl_membrane = raise_it(parse_field(parse_float, "membrane"))
dl_nucleus = raise_it(parse_field(parse_float, "nucleus"))
dl_cytoplasm = raise_it(parse_field(parse_float, "cytoplasm"))
dl_extracellular = raise_it(parse_field(parse_float, "extracellular"))
dl_mitochondrion = raise_it(parse_field(parse_float, "mitochondrion"))
dl_cell_membrane = raise_it(parse_field(parse_float, "cell_membrane"))
dl_endoplasmic_reticulum = raise_it(
    parse_field(parse_float, "endoplasmic_reticulum"))
dl_plastid = raise_it(parse_field(parse_float, "plastid"))
dl_golgi_apparatus = raise_it(parse_field(parse_float, "golgi_apparatus"))
# The column is named "lysosome_vacuole" although the label is
# "Lysosome/Vacuole".
dl_lysosome = raise_it(parse_field(parse_float, "lysosome_vacuole"))
dl_peroxisome = raise_it(parse_field(parse_float, "peroxisome"))
コード例 #8
0
 def _parse_query_length_line(field: str) -> int:
     """ Parse the "Match_columns" line and return its integer value.

     The parser is registered under the field name "query_length".
     """
     value_parser = split_at_multispace(parse_int, "Match_columns")
     parse_line = raise_it(parse_field(value_parser, "query_length"))
     return parse_line(field)
コード例 #9
0
ファイル: mmseqs.py プロジェクト: ccdmb/predector-utils
from typing import Optional

from predectorutils.gff import (GFFRecord, GFFAttributes, Strand, Target, Gap,
                                GapCode, GapElement)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
)

# One parser per "mm_" column. Each pairs a value parser with the column name
# via parse_field and is wrapped in raise_it. Names and the cigar string are
# kept as text, positions/lengths/counts are integers, and evalue, pident,
# raw and bits are floats.
mm_query = raise_it(parse_field(parse_str, "query"))
mm_target = raise_it(parse_field(parse_str, "target"))
mm_qstart = raise_it(parse_field(parse_int, "qstart"))
mm_qend = raise_it(parse_field(parse_int, "qend"))
mm_qlen = raise_it(parse_field(parse_int, "qlen"))
mm_tstart = raise_it(parse_field(parse_int, "tstart"))
mm_tend = raise_it(parse_field(parse_int, "tend"))
mm_tlen = raise_it(parse_field(parse_int, "tlen"))
mm_evalue = raise_it(parse_field(parse_float, "evalue"))
mm_gapopen = raise_it(parse_field(parse_int, "gapopen"))
mm_pident = raise_it(parse_field(parse_float, "pident"))
mm_alnlen = raise_it(parse_field(parse_int, "alnlen"))
mm_raw = raise_it(parse_field(parse_float, "raw"))
mm_bits = raise_it(parse_field(parse_float, "bits"))
mm_cigar = raise_it(parse_field(parse_str, "cigar"))
mm_mismatch = raise_it(parse_field(parse_int, "mismatch"))
コード例 #10
0
from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    parse_int, parse_bool, parse_regex,
                                    MULTISPACE_REGEX, is_one_of, is_value)

# Public names of this module: the four SignalP analysis classes.
__all__ = ["SignalP3NN", "SignalP3HMM", "SignalP4", "SignalP5"]

# One parser per "s3nn_" column. Each pairs a value parser with the column
# name via parse_field and is wrapped in raise_it. The cmax, ymax and smax
# groups each have a float score, an integer position and a decision flag;
# smean has a score and a decision flag only.
# NOTE(review): parse_bool("Y", "N") presumably maps "Y" to True and "N" to
# False -- confirm the argument order.
s3nn_name = raise_it(parse_field(parse_str, "name"))
s3nn_cmax = raise_it(parse_field(parse_float, "cmax"))
s3nn_cmax_pos = raise_it(parse_field(parse_int, "cmax_pos"))
s3nn_cmax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "cmax_decision"))
s3nn_ymax = raise_it(parse_field(parse_float, "ymax"))
s3nn_ymax_pos = raise_it(parse_field(parse_int, "ymax_pos"))
s3nn_ymax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "ymax_decision"))
s3nn_smax = raise_it(parse_field(parse_float, "smax"))
s3nn_smax_pos = raise_it(parse_field(parse_int, "smax_pos"))
s3nn_smax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smax_decision"))
s3nn_smean = raise_it(parse_field(parse_float, "smean"))
s3nn_smean_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smean_decision"))
コード例 #11
0
from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import float_or_none, str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_regex, parse_float, is_one_of)

# Parsers for the "tp_" columns. Each pairs a value parser with the column
# name via parse_field and is wrapped in raise_it.
tp_name = raise_it(parse_field(parse_str, "name"))
# This label list includes "noTP", which the pl_prediction list below lacks.
tp_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "noTP", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
tp_other = raise_it(parse_field(parse_float, "OTHER"))
tp_sp = raise_it(parse_field(parse_float, "SP"))
tp_mtp = raise_it(parse_field(parse_float, "mTP"))

# Parsers for the "pl_" columns: a separate prediction label list plus the
# cTP and luTP float columns.
pl_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
pl_ctp = raise_it(parse_field(parse_float, "cTP"))
pl_lutp = raise_it(parse_field(parse_float, "luTP"))

CS_POS_REGEX = re.compile(r"CS\s+pos:\s+\d+-(?P<cs>\d+)\.?\s+"
                          r"[A-Za-z]+-[A-Za-z]+\.?\s+"
コード例 #12
0
ファイル: hmmer.py プロジェクト: ccdmb/predector-utils
    parse_str,
    parse_float,
    parse_int,
    MULTISPACE_REGEX,
)


def split_hmm(s: str) -> Union[ValueParseError, str]:
    """ Return the part of `s` that precedes its last ".hmm".

    The input is first passed through parse_str; if that returns a
    ValueParseError, the error is returned unchanged. A string that does not
    contain ".hmm" is returned whole.
    """
    checked = parse_str(s)
    if not isinstance(checked, ValueParseError):
        # The split is applied to the raw input, not to the parse_str result.
        return s.rsplit(".hmm", maxsplit=1)[0]
    return checked


# One parser per "hm_" column. Each pairs a value parser with the column name
# via parse_field and is wrapped in raise_it. The hmm column goes through
# split_hmm rather than parse_str, so it is cut at its last ".hmm".
hm_name = raise_it(parse_field(parse_str, "name"))  # query name
hm_hmm = raise_it(parse_field(split_hmm, "hmm"))  # target name
hm_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))  # tlen
hm_query_len = raise_it(parse_field(parse_int, "query_len"))  # qlen
# Scores, biases and e-values are floats; counts and coordinates are integers.
hm_full_evalue = raise_it(parse_field(parse_float, "full_evalue"))
hm_full_score = raise_it(parse_field(parse_float, "full_score"))
hm_full_bias = raise_it(parse_field(parse_float, "full_bias"))
hm_nmatches = raise_it(parse_field(parse_int, "nmatches"))
hm_domain_c_evalue = raise_it(parse_field(parse_float, "domain_c_evalue"))
hm_domain_i_evalue = raise_it(parse_field(parse_float, "domain_i_evalue"))
hm_domain_score = raise_it(parse_field(parse_float, "domain_score"))
hm_domain_bias = raise_it(parse_field(parse_float, "domain_bias"))
hm_hmm_from = raise_it(parse_field(parse_int, "hmm_from"))
hm_hmm_to = raise_it(parse_field(parse_int, "hmm_to"))
hm_query_from = raise_it(parse_field(parse_int, "query_from"))
hm_query_to = raise_it(parse_field(parse_int, "query_to"))
コード例 #13
0
 def _parse_query_neff_line(field: str) -> float:
     """ Parse the "Neff" line and return its float value.

     The parser is registered under the field name "query_neff".
     """
     value_parser = split_at_multispace(parse_float, "Neff")
     parse_line = raise_it(parse_field(value_parser, "query_neff"))
     return parse_line(field)
コード例 #14
0
ファイル: deepsig.py プロジェクト: ccdmb/predector-utils
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    raise_it,
    parse_field,
    parse_str,
    parse_float,
    parse_int,
    parse_or_none,
    is_one_of
)

# Only the analysis class is exported.
__all__ = ["DeepSig"]


# One parser per DeepSig column (name, prediction, prob, cs_pos). Each pairs a
# value parser with the column name via parse_field and is wrapped in
# raise_it.
ds_name = raise_it(parse_field(parse_str, "name"))
# The prediction column is checked against these three labels.
ds_prediction = raise_it(parse_field(
    is_one_of(["SignalPeptide", "Transmembrane", "Other"]),
    "prediction"
))
ds_prob = raise_it(parse_field(parse_float, "prob"))
# NOTE(review): parse_or_none(parse_int, "-") presumably yields None for a "-"
# cs_pos and an integer otherwise -- confirm.
ds_cs_pos = raise_it(parse_field(parse_or_none(parse_int, "-"), "cs_pos"))


class DeepSig(Analysis, GFFAble):

    """     """

    columns = ["name", "prediction", "prob", "cs_pos"]
    types = [str, str, float, int_or_none]
    analysis = "deepsig"
コード例 #15
0
from typing import Optional

from predectorutils.gff import (GFFRecord, Strand)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
    split_at_eq,
)

# One parser per "tm_" column. Apart from the name, every value parser is
# built with split_at_eq and a key ("len", "ExpAA", "First60", "PredHel",
# "Topology"), then registered under a snake_case field name.
# NOTE(review): split_at_eq presumably expects "key=value" text and parses the
# value part -- confirm in predectorutils.parsers.
tm_name = raise_it(parse_field(parse_str, "name"))
tm_length = raise_it(parse_field(split_at_eq(parse_int, "len"), "length"))
tm_exp_aa = raise_it(parse_field(split_at_eq(parse_float, "ExpAA"), "exp_aa"))
tm_first_60 = raise_it(
    parse_field(split_at_eq(parse_float, "First60"), "first_60"))
tm_pred_hel = raise_it(
    parse_field(split_at_eq(parse_int, "PredHel"), "pred_hel"))
tm_topology = raise_it(
    parse_field(split_at_eq(parse_str, "Topology"), "topology"))


def parse_topology(string: str) -> List[Tuple[int, int]]:
    parts = re.findall(r"(?P<tag>[ncio])(?P<start>\d+)-(?P<end>\d+)", string)
    out = []
    for tag, start, end in parts:
        assert tag in ("i", "o"), string
コード例 #16
0
ファイル: regex.py プロジェクト: ccdmb/predector-utils
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator
from typing import Optional

from predectorutils.gff import GFFRecord, GFFAttributes, Strand
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_int)

# One parser per "re_" column (name, kind, pattern, match, start, end). Each
# pairs a value parser with the column name via parse_field and is wrapped in
# raise_it. The first four columns are text; start and end are integers.
re_name = raise_it(parse_field(parse_str, "name"))
re_kind = raise_it(parse_field(parse_str, "kind"))
re_pattern = raise_it(parse_field(parse_str, "pattern"))
re_match = raise_it(parse_field(parse_str, "match"))
re_start = raise_it(parse_field(parse_int, "start"))
re_end = raise_it(parse_field(parse_int, "end"))


class RegexAnalysis(Analysis, GFFAble):

    columns = ["name", "kind", "pattern", "match", "start", "end"]

    types = [str, str, str, str, int, int]

    analysis = "regex"
    software = "predutils"

    def __init__(
        self,
コード例 #17
0
ファイル: deepredeff.py プロジェクト: ccdmb/predector-utils
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# One parser per "dre_" column (name, s_score, prediction). Each pairs a value
# parser with the column name via parse_field and is wrapped in raise_it.
dre_name = raise_it(parse_field(parse_str, "name"))
dre_s_score = raise_it(parse_field(parse_float, "s_score"))
# The prediction labels here are all lower-case.
dre_prediction = raise_it(
    parse_field(is_one_of(["effector", "non-effector"]), "prediction"))


class Deepredeff(Analysis):
    """ """
    columns = [
        "name",
        "s_score",
        "prediction",
    ]

    types = [
        str,
        float,
        str,
    ]

    software = "deepredeff"
コード例 #18
0
#!/usr/bin/env python3

import re

from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_regex, parse_str,
                                    parse_float, is_one_of)
from predectorutils.analyses.base import float_or_none

# One parser per "e1_" column (name, prediction, prob). Each pairs a value
# parser with the column name via parse_field and is wrapped in raise_it.
e1_name = raise_it(parse_field(parse_str, "name"))
# The prediction labels here are capitalised.
e1_prediction = raise_it(
    parse_field(is_one_of(["Effector", "Non-effector"]), "prediction"))
e1_prob = raise_it(parse_field(parse_float, "prob"))


class EffectorP1(Analysis):
    """ """

    columns = ["name", "prediction", "prob"]
    types = [str, str, float]
    analysis = "effectorp1"
    software = "EffectorP"

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        self.name = name
        self.prediction = prediction
コード例 #19
0
 def _parse_query_line(field: str) -> str:
     """ Parse the "Query" line and return its value as a string.

     The parser is registered under the field name "query".
     """
     value_parser = split_at_multispace(parse_str, "Query")
     parse_line = raise_it(parse_field(value_parser, "query"))
     return parse_line(field)