def mp_parse_nfasta_header(header: str):
    # E.g. "10001|vfid|47953|vsiid|68790|ssid|SubName: Full=Leader peptidase PilD; SubName: Full=Type 4 prepilin peptidase VcpD; SubName: Full=Type IV-A prepilin peptidase PilD;"
    _REGEXES = {
        "vfid": (r"^([^|]+)[ ]*\|vfid[\|]*", r"^([^|]+[ ]*\|vfid[\|]*)"),
        "vsiid": (r"^([^|]+)[ ]*\|vsiid[\|]*", r"^([^|]+[ ]*\|vsiid[\|]*)"),
        "ssid": (r"^([^|]+)[ ]*\|ssid[\|]*", r"^([^|]+[ ]*\|ssid[\|]*)"),
        "feature_names": (r"^[ ]*([^|]+)$", r"^([ ]*[^|]+)$"),
    }

    out = regex_based_tokenization(_REGEXES, header)
    # Strip every token; cast the purely numeric ones to int
    for key, value in out.items():
        value = value.strip()
        out[key] = int(value) if value.isnumeric() else value

    out["former_id"] = out.pop("source_string")
    out.update({
        k: safe_findall(v, out["feature_names"])
        for k, v in {
            "gene_host": r"\[([^\]]+)\] *$",
            "recname_full": r"[^_]*RecName:[_ ]Full=([^;]+);",
            "subname_full": r"[^_]*SubName:[_ ]Full=([^;]+);",
        }.items()
    })
    return out
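
Every snippet on this page delegates to the project helpers regex_based_tokenization and safe_findall, whose definitions are not shown here. Below is a minimal sketch consistent with how the snippets use them; the exact semantics are assumed from usage, not taken from the project source. Note that the tokenizer relies on the regex dict preserving insertion order (guaranteed since Python 3.7):

import re


def regex_based_tokenization(regex_dict, string):
    # Assumed behaviour: each key maps to an (extract, strip) pattern pair.
    # The first pattern yields the token (capture group 1); the second marks
    # the span to cut away before the next key is tried, so the keys consume
    # the string piece by piece, usually ending with a catch-all pattern.
    # The untouched input is kept under "source_string" for callers to rename.
    out = {"source_string": string}
    remainder = string
    for key, (extract_pattern, strip_pattern) in regex_dict.items():
        match = re.search(extract_pattern, remainder)
        out[key] = match.group(1) if match else ""
        remainder = re.sub(strip_pattern, "", remainder, count=1)
    return out


def safe_findall(pattern, string, verbose=True):
    # Assumed behaviour: return the first match, or an empty string
    # instead of raising when nothing matches
    found = re.findall(pattern, string)
    if found:
        return found[0]
    if verbose:
        print("No match of '{}' in '{}'".format(pattern, string))
    return ""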
Example #2
def mp_parse_nfasta_header(header: str):
    _VFDB_REGEXES = {
        # The trailing "[...]" groups are consumed right to left:
        # the host organism first, then the remaining bracket group as the gene name
        "VFID": (r"^([^\(\)]+)", r"^([^\(\)]+)"),
        "gene_host": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_name": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_description": (r"([^\(\)]+)$", r"([^\(\)]+)$"),
        "gene_symbol": (r"\(([^\(\)]+)\) *$", r"^\([^\(\)]+\) *$"),
        "gene_accession_id": (r"^\(([^\(\)]+)\)", r"^\([^\(\)]+\) *"),
    }
    out = regex_based_tokenization(_VFDB_REGEXES, header)
    out["former_id"] = out.pop("source_string")
    out["vfdb_number"] = int(safe_findall("[0-9]+", out["VFID"]))
    return out
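
For illustration, a hypothetical VFDB-style header shaped to fit these patterns (the real VFDB layout may differ) would tokenize as follows under the helper sketch above:

parsed = mp_parse_nfasta_header(
    "VFG037176(gb|WP_001081735) (plc1) phospholipase C [Plc (VF0470)] [Acinetobacter baumannii ACICU]")
# parsed["VFID"] == "VFG037176"
# parsed["gene_accession_id"] == "gb|WP_001081735"
# parsed["gene_symbol"] == "plc1"
# parsed["gene_host"] == "Acinetobacter baumannii ACICU"
# parsed["vfdb_number"] == 37176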
Example #3
def mp_parse_pfasta_header(header: str):
    out = regex_based_tokenization(
        {
            "tadb_id": (r"^TADB\|([^ ]+) *", r"(^TADB\|[^ ]+ *)"),
            # The two trailing "[...]" groups are consumed right to left:
            # the symbol first, then the host
            "protein_symbol": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            "protein_host": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            "protein_geninfo_id": (r"^gi\|([0-9]+)\|*", r"(^gi\|[0-9]+\|*)"),
            "protein_refseq_id": (r"^[\| ]*ref\|([^\|]+)[\| ]*", r"(^[\| ]*ref\|[^\|]+[\| ]*)"),
            "protein_description": (r"(.*)", r"(.*)"),
        }, header)
    out["protein_header"] = out.pop("source_string")
    return out
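
Likewise, a hypothetical protein header shaped to fit these patterns (since the symbol regex fires before the host regex, the last "[...]" group is read as the symbol):

parsed = mp_parse_pfasta_header(
    "gi|12345|ref|NP_000001.1| antitoxin MazE [Escherichia coli K-12] [mazE]")
# parsed["protein_geninfo_id"] == "12345"
# parsed["protein_refseq_id"] == "NP_000001.1"
# parsed["protein_host"] == "Escherichia coli K-12"
# parsed["protein_symbol"] == "mazE"
# parsed["protein_description"] == "antitoxin MazE"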
Example #4
import os


def tokenize_reads_file_name(s: str):
    d = regex_based_tokenization(
        {
            "extension": [r"\.(.{2,8})$", r"(\..{2,8})$"],  # E.g. '.fastq.gz'
            "last_segment": [
                r"[^A-Za-z0-9]([A-Za-z0-9]+)$", r"([^A-Za-z0-9][A-Za-z0-9]+)$"
            ],  # The last segment is always '001'
            "read_index": [r"[^A-Za-z0-9](R[0-9]+)$", r"([^A-Za-z0-9]R[0-9]+)$"],
            "lane_number": [r"[^A-Za-z0-9](L[0-9]+)$", r"([^A-Za-z0-9]L[0-9]+)$"],
            "sample_sheet_number": [r"[^A-Za-z0-9](S[0-9]+)$", r"([^A-Za-z0-9]S[0-9]+)$"],
            "sample_name": [r"(.+)", r"(.+)"],
        },
        os.path.basename(s))
    d["reads_file"] = s
    return d
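
Assuming the helper sketch above, a typical Illumina-style file name decomposes from right to left:

d = tokenize_reads_file_name("/data/run1/Sample-1_S1_L001_R1_001.fastq.gz")
# d["extension"] == "fastq.gz"
# d["last_segment"] == "001"
# d["read_index"] == "R1"
# d["lane_number"] == "L001"
# d["sample_sheet_number"] == "S1"
# d["sample_name"] == "Sample-1"
# d["reads_file"] == "/data/run1/Sample-1_S1_L001_R1_001.fastq.gz"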
Example #5
def mp_parse_nfasta_header(header: str):
    out = regex_based_tokenization(
        {
            "tadb_id": (r"^TADB\|([^ ]+) *", r"(^TADB\|[^ ]+ *)"),
            "gene_symbol": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            "gene_geninfo_id": (r"^gi\|([0-9]+)[\| ]*", r"(^gi\|[0-9]+[\|]*)"),
            "gene_refseq_id": (r"^[\| ]*ref\|([^\|]+)[\| ]*", r"(^[\| ]*ref\|[^\|]+[\| ]*)"),
            # A ':c' prefix before the locus range marks the complementary (antisense) strand
            "dna_strand": (r"^[\| ]*:([c]*)", r"(^[\| ]*:[c]*)"),
            "start_locus": (r"^([0-9]+)[ -]*", r"(^[0-9]+[ -]*)"),
            "end_locus": (r"^[ -]*([0-9]+)[ -]*", r"(^[ -]*[0-9]+[ -]*)"),
            "gene_description": (r"(.*)", r"(.*)"),
        }, header)
    out["former_id"] = out.pop("source_string")
    out["is_antisense_dna_strand"] = out["dna_strand"] == "c"
    out["tadb_number"] = safe_findall(r"^[A-Z]*([0-9]+)",
                                      str(out["tadb_id"]).upper(),
                                      verbose=False)
    return out
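
For illustration, a hypothetical TADB-style nucleotide header with a complementary-strand locus (shaped to fit these patterns, traced with the helper sketch above):

parsed = mp_parse_nfasta_header(
    "TADB|T001 gi|12345|ref|NC_000001.1|:c100-200 MazF toxin [mazF]")
# parsed["tadb_id"] == "T001", parsed["gene_symbol"] == "mazF"
# parsed["gene_refseq_id"] == "NC_000001.1"
# parsed["dna_strand"] == "c", so parsed["is_antisense_dna_strand"] is True
# parsed["start_locus"] == "100", parsed["end_locus"] == "200"
# parsed["tadb_number"] == "001" (a string; no int() cast is applied here)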