def mp_parse_nfasta_header(header: str):
    """Tokenize a pipe-delimited nFASTA header into a dict of named fields.

    Example header:
        10001|vfid|47953|vsiid|68790|ssid|SubName: Full=Leader peptidase PilD; SubName: Full=Type 4 prepilin peptidase VcpD; SubName: Full=Type IV-A prepilin peptidase PilD;

    The header is consumed left to right by ``regex_based_tokenization``:
    each entry is ``(extract_regex, excise_regex)`` and the excised part is
    removed before the next entry is applied, so the order of the entries
    matters.

    :param header: raw header line.
    :return: dict with the keys ``vfid``, ``vsiid``, ``ssid``,
        ``feature_names``, ``former_id`` (the original header), ``gene_host``,
        ``recname_full`` and ``subname_full``. Values consisting solely of
        decimal digits are converted to ``int``.
    """
    # Raw strings: the patterns contain backslash escapes that are meant for
    # the regex engine, not for the Python string parser.
    _REGEXES = {
        "vfid": (r"^([^|]+)[ ]*\|vfid[\|]*", r"^([^|]+[ ]*\|vfid[\|]*)"),
        "vsiid": (r"^([^|]+)[ ]*\|vsiid[\|]*", r"^([^|]+[ ]*\|vsiid[\|]*)"),
        "ssid": (r"^([^|]+)[ ]*\|ssid[\|]*", r"^([^|]+[ ]*\|ssid[\|]*)"),
        "feature_names": (r"^[ ]*([^|]+)$", r"^([ ]*[^|]+)$"),
    }

    out = regex_based_tokenization(_REGEXES, header)
    # Only values of existing keys are replaced, so the dict size is stable
    # while iterating. Note that the "source_string" entry is visited too.
    for key, value in out.items():
        value = value.strip()
        # isdecimal() (not isnumeric()) is the check that matches what int()
        # can parse; isnumeric() also accepts e.g. fractions and Roman
        # numeral characters, for which int() raises ValueError.
        if value.isdecimal():
            out[key] = int(value)

    out["former_id"] = out.pop("source_string")
    # Secondary fields are looked up inside the remaining free-text part.
    # NOTE(review): safe_findall is defined elsewhere; its behaviour when
    # "feature_names" was converted to int above should be confirmed.
    feature_regexes = {
        "gene_host": r"\[([^\]]+)\] *$",
        "recname_full": r"[^_]*RecName:[_ ]Full=([^;]+);",
        "subname_full": r"[^_]*SubName:[_ ]Full=([^;]+);",
    }
    out.update({
        k: safe_findall(v, out["feature_names"])
        for k, v in feature_regexes.items()
    })
    return out
Beispiel #2
0
def mp_parse_nfasta_header(header: str):
    """Tokenize a header made of parenthesised and bracketed parts.

    The header is consumed step by step by ``regex_based_tokenization``:
    each entry is ``(extract_regex, excise_regex)`` and the excised part is
    removed before the next entry is applied, so the order of the entries
    matters. ``gene_host`` and ``gene_name`` deliberately share a pattern:
    the first takes the trailing ``[...]`` group, the second takes the
    ``[...]`` group that is trailing once the first has been removed.

    :param header: raw header line.
    :return: dict with the keys ``VFID``, ``gene_host``, ``gene_name``,
        ``gene_description``, ``gene_symbol``, ``gene_accession_id``,
        ``former_id`` (the original header) and ``vfdb_number`` (the digits
        found in ``VFID``, as ``int``).
    """
    # Raw strings: the backslash escapes are meant for the regex engine,
    # not for the Python string parser.
    _VFDB_REGEXES = {
        "VFID": (r"^([^\(\)]+)", r"^([^\(\)]+)"),
        "gene_host": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_name": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_description": (r"([^\(\)]+)$", r"([^\(\)]+)$"),
        "gene_symbol": (r"\(([^\(\)]+)\) *$", r"^\([^\(\)]+\) *$"),
        "gene_accession_id": (r"^\(([^\(\)]+)\)", r"^\([^\(\)]+\) *"),
    }
    out = regex_based_tokenization(_VFDB_REGEXES, header)
    out["former_id"] = out.pop("source_string")
    # NOTE(review): int() raises if safe_findall yields something that is not
    # a digit string (e.g. when VFID holds no digits) - confirm against
    # safe_findall, which is defined elsewhere.
    out["vfdb_number"] = int(safe_findall(r"[0-9]+", out["VFID"]))
    return out
Beispiel #3
0
def regex_based_tokenization(regex_dict: dict,
                             string: str,
                             include_source: bool = True,
                             verbose: bool = False):
    """Extract named tokens from a string by consuming it piece by piece.

    For every ``key`` in ``regex_dict`` (in dict order) the extract regex is
    applied to what is left of the string, the stripped result is stored
    under ``key``, and then every match of the excise regex is removed from
    the remaining string before the next key is processed.

    :param regex_dict: maps column name to a sequence of regexes:
        ``(extract_regex, excise_regex)``, or a one-element sequence when the
        same regex serves both purposes.
    :param string: the text to tokenize; converted with ``str()`` first.
    :param include_source: when true, the untouched input is stored under
        the key ``"source_string"``.
    :param verbose: passed through to ``safe_findall``.
    :return: dict of the extracted tokens (plus ``"source_string"`` if
        requested).
    """
    out = {}
    if include_source:
        out["source_string"] = string
    remainder = str(string)

    for key, regexes in regex_dict.items():
        remainder = remainder.strip()
        extract_regex = regexes[0]
        # A single regex doubles as the excise pattern.
        excise_regex = regexes[0] if len(regexes) == 1 else regexes[1]

        # Extract exactly once per key (previously the lookup was executed
        # twice and the first result was discarded).
        out[key] = safe_findall(extract_regex, remainder,
                                verbose=verbose).strip()
        remainder = re.sub(excise_regex, "", remainder)
    return out
Beispiel #4
0
 def mp_parse_nfasta_header(header: str):
     """Tokenize a header that starts with ``TADB|`` into a dict of fields.

     The header is consumed step by step by ``regex_based_tokenization``:
     each entry is ``(extract_regex, excise_regex)`` and the excised part is
     removed before the next entry is applied, so the order of the entries
     matters. Whatever is left at the end becomes ``gene_description``.

     :param header: raw header line.
     :return: dict with the keys ``tadb_id``, ``gene_symbol``,
         ``gene_geninfo_id``, ``gene_refseq_id``, ``dna_strand``,
         ``start_locus``, ``end_locus``, ``gene_description``, ``former_id``
         (the original header), ``is_antisense_dna_strand`` and
         ``tadb_number``.
     """
     # Raw strings: the backslash escapes are meant for the regex engine,
     # not for the Python string parser.
     out = regex_based_tokenization(
         {
             "tadb_id": (r"^TADB\|([^ ]+) *", r"(^TADB\|[^ ]+ *)"),
             "gene_symbol": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
             "gene_geninfo_id":
             (r"^gi\|([0-9]+)[\| ]*", r"(^gi\|[0-9]+[\|]*)"),
             "gene_refseq_id":
             (r"^[\| ]*ref\|([^\|]+)[\| ]*", r"(^[\| ]*ref\|[^\|]+[\| ]*)"),
             "dna_strand": (r"^[\| ]*:([c]*)", r"(^[\| ]*:[c]*)"),
             "start_locus": (r"^([0-9]+)[ -]*", r"(^[0-9]+[ -]*)"),
             "end_locus": (r"^[ -]*([0-9]+)[ -]*", r"(^[ -]*[0-9]+[ -]*)"),
             "gene_description": (r"(.*)", r"(.*)"),
         }, header)
     out["former_id"] = out.pop("source_string")
     # A "c" right after the colon marks the strand as antisense here.
     out["is_antisense_dna_strand"] = out["dna_strand"] == "c"
     # Digits that follow the (optional) upper-cased letter prefix of the ID.
     out["tadb_number"] = safe_findall(r"^[A-Z]*([0-9]+)",
                                       str(out["tadb_id"]).upper(),
                                       verbose=False)
     return out
def parse_blast_record(blast_record: Record):
    """Collect the high-scoring pairs of a BLAST record below the E-value cut.

    Based on: https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc95

    :param blast_record: object exposing ``alignments``; each alignment
        exposes ``title``, ``length`` and ``hsps``.
    :return: ``OrderedDict`` mapping the alignment title to a dict of the
        HSP attributes, each value passed through
        ``clear_non_printing_chars``. Only HSPs whose ``expect`` is below
        ``E_VALUE_THRESH`` are kept.
    """
    high_scoring_pairs = OrderedDict()
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                title = alignment.title
                pair = {
                    "title": title,
                    "length": alignment.length,
                    "expect": hsp.expect,
                    "score": hsp.score,
                    "bits": hsp.bits,
                    "identities": hsp.identities,
                    "positives": hsp.positives,
                    "query": hsp.query,
                    "match": hsp.match,
                    "sbjct": hsp.sbjct,
                    # Raw string: the backslashes belong to the regex.
                    "geninfo_id": safe_findall(r"gi\|([^|]+)\|",
                                               title).strip(),
                }
                # NOTE(review): the key is the alignment title, so when one
                # alignment has several qualifying HSPs the last one wins.
                high_scoring_pairs[title] = {
                    k: clear_non_printing_chars(v)
                    for k, v in pair.items()
                }
    return high_scoring_pairs