Exemple #1
0
def test_cmdline_tokenizer(nlp):
    """Verify how command lines are split into tokens.

    Four command lines are parsed and ``doc._.tokens`` is compared with the
    expected token list for each. The cases cover an unquoted argument list,
    a quoted trailing string kept as one token, and ``cmd.exe /c`` and
    ``cmd.exe /k`` invocations, for which the test expects the remainder of
    the line to come back as a single double-quoted token.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    nlp.add_pipe(CommandLineTagger(nlp), last=True)

    # Each entry pairs a raw command line with the tokens it should yield.
    cases = [
        (
            r'"C:\Program Files\MyProgram.exe" /d C:\Users\Alice\file.txt --file C:\test.py',
            [
                r'"C:\Program Files\MyProgram.exe"',
                r'/d',
                r'C:\Users\Alice\file.txt',
                r'--file',
                r'C:\test.py',
            ],
        ),
        (
            r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\file.txt --file C:\test.py"',
            [
                r'"C:\Program Files\MyProgram.exe"',
                r'/d',
                r'"C:\Users\Alice\file.txt --file C:\test.py"',
            ],
        ),
        (
            r'C:\Windows\System32\cmd.exe /c C:\Users\Alice\file.txt --f C:\test.py',
            [
                r'C:\Windows\System32\cmd.exe',
                r'/c',
                r'"C:\Users\Alice\file.txt --f C:\test.py"',
            ],
        ),
        (
            r'C:\Windows\System32\cmd.exe /k C:\Users\Alice\file.txt --f C:\test.py',
            [
                r'C:\Windows\System32\cmd.exe',
                r'/k',
                r'"C:\Users\Alice\file.txt --f C:\test.py"',
            ],
        ),
    ]

    # Parse every command line first, then check the results in order.
    parsed = [nlp(command_line) for command_line, _ in cases]
    for doc, (_, expected_tokens) in zip(parsed, cases):
        assert doc._.tokens == expected_tokens
Exemple #2
0
def test_cmdline_cmd(nlp):
    """Check that a quoted string holding a command line is tagged as a command.

    The third token of the parsed line is a double-quoted string that itself
    contains a command line. The test expects the ``has_cmd`` / ``is_cmd``
    extensions to flag it, ``doc._.cmd`` to list it first as an
    ``(index, token)`` pair, and its stem to equal its text.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)
    doc = nlp(
        r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py"'
    )
    expected_cmd = r'"C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py"'

    # Plain truthiness asserts instead of comparing against ``True`` (PEP 8).
    assert doc._.has_cmd
    assert doc[2]._.is_cmd

    idx, cmd = doc._.cmd[0]
    assert idx == 2
    assert cmd.text == expected_cmd
    assert cmd._.stem == expected_cmd
Exemple #3
0
def test_cmdline_arg(nlp):
    """Check that switch-style tokens are tagged as arguments.

    For the parsed command line the test expects ``/d`` (token 1) and
    ``--file`` (token 3) to be flagged by ``is_arg``, ``doc._.has_arg`` to be
    set, and the first entry of ``doc._.arg`` to be the ``/d`` token with an
    unchanged stem.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)
    doc = nlp(
        r'"C:\Program Files\MyProgram.exe" /d C:\Users\Alice\file.txt --file C:\test.py'
    )

    # Plain truthiness asserts instead of comparing against ``True`` (PEP 8).
    assert doc._.has_arg
    assert doc[1]._.is_arg
    assert doc[3]._.is_arg

    idx, arg = doc._.arg[0]
    assert idx == 1
    assert arg.text == "/d"
    assert arg._.stem == "/d"
Exemple #4
0
def test_cmdline_path(nlp):
    """Check that the executable path is tagged as a path and stemmed.

    The test expects token 0 (the quoted executable) to be flagged by
    ``is_path``, token 2 (the quoted sub-command string) not to be, and the
    first entry of ``doc._.path`` to be token 0 with the stem
    ``"?pf64\\myprogram.exe"``.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)
    doc = nlp(
        r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\file.txt --file C:\test.py"'
    )

    # Truthiness asserts instead of comparing against ``True``/``False``
    # (PEP 8).
    assert doc._.has_path
    assert doc[0]._.is_path
    assert not doc[2]._.is_path

    idx, path = doc._.path[0]
    assert idx == 0
    assert path.text == r'"C:\Program Files\MyProgram.exe"'
    assert path._.stem == r'"?pf64\myprogram.exe"'
Exemple #5
0
def test_cmdline_val(nlp):
    """Check that tokens following an argument are tagged as values.

    The test expects tokens 2 and 4 to be flagged by ``is_val``,
    ``doc._.has_val`` to be set, and the second entry of ``doc._.val`` to be
    token 4 (``C:\\Users\\Bob\\file``) with the stem ``?usr\\file``.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)
    doc = nlp(
        r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\file.txt --file C:\test.py" -f C:\Users\Bob\file'
    )

    # Plain truthiness asserts instead of comparing against ``True`` (PEP 8).
    assert doc._.has_val
    assert doc[2]._.is_val
    assert doc[4]._.is_val

    idx, val = doc._.val[1]
    assert idx == 4
    assert val.text == r'C:\Users\Bob\file'
    assert val._.stem == r'?usr\file'
Exemple #6
0
def test_cmdline_sub_cmd(nlp):
    """Check that a quoted command token exposes a parsed sub-command.

    The test expects ``doc[2]._.sub_cmd`` to be an object whose type is named
    ``Doc``, with its own tokens and stems for the unquoted inner command
    line, and its three tokens to be tagged as path, argument and value in
    that order.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)
    doc = nlp(
        r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py"'
    )

    # Look the sub-command up once rather than on every assertion.
    sub_cmd = doc[2]._.sub_cmd

    # The class is compared by name because ``Doc`` is not referenced
    # anywhere else in this test.
    assert type(sub_cmd).__name__ == "Doc"
    assert list(sub_cmd._.tokens) == [
        r'C:\Users\Alice\appdata\local\temp\file.txt', r'--file', r'C:\test.py'
    ]
    assert sub_cmd._.stems == [
        r'?usrtmp\file.txt', '--file', r'?c\test.py'
    ]

    # Plain truthiness asserts instead of comparing against ``True`` (PEP 8).
    assert sub_cmd[0]._.is_path
    assert sub_cmd[1]._.is_arg
    assert sub_cmd[2]._.is_val
Exemple #7
0
def test_cmd_normalization(nlp):
    """Verify the normalized form of whole command lines and sub-commands.

    Three variants of the same invocation are parsed: the trailing command
    quoted, unquoted, and passed to ``cmd.exe /c``. The test checks
    ``_.normalize`` on each document and, for the quoted and ``cmd.exe``
    variants, on the sub-command attached to token 2.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    nlp.add_pipe(CommandLineTagger(nlp), last=True)

    quoted = nlp(
        r'"C:\Program Files\MyProgram.exe" /d "C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py"'
    )
    unquoted = nlp(
        r'"C:\Program Files\MyProgram.exe" /d C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py'
    )
    via_cmd = nlp(
        r'C:\Windows\System32\cmd.exe /c C:\Users\Alice\appdata\local\temp\file.txt --file C:\test.py'
    )

    # Both sub-commands are expected to normalize to the same string.
    normalized_sub_cmd = r'?usrtmp\file.txt --file ?c\test.py'

    assert (
        quoted._.normalize
        == r'"?pf64\myprogram.exe" /d "?usrtmp\file.txt --file ?c\test.py"'
    )
    assert quoted[2]._.sub_cmd._.normalize == normalized_sub_cmd
    assert (
        unquoted._.normalize
        == r'"?pf64\myprogram.exe" /d ?usrtmp\file.txt --file ?c\test.py'
    )
    assert (
        via_cmd._.normalize
        == r'?sys64\cmd.exe /c "?usrtmp\file.txt --file ?c\test.py"'
    )
    assert via_cmd[2]._.sub_cmd._.normalize == normalized_sub_cmd
Exemple #8
0
def test_cmdline_stemming(nlp):
    """Verify the stem produced for each well-known Windows path prefix.

    Each input is parsed on its own and ``_.stems`` is compared with the
    expected placeholder form: volume GUID, system drive (three prefix
    spellings), Windows root, user temp directory, user profile,
    ``Program Files``, ``Program Files (x86)``, ``Syswow64`` and ``System32``.
    """
    nlp.tokenizer = CommandLineTokenizer(nlp.vocab)
    cmdline_tagger = CommandLineTagger(nlp)
    nlp.add_pipe(cmdline_tagger, last=True)

    guid = nlp(r'\?\Volume{26a21bda-a627-11d7-9931-806e6f6e6963}')
    system_drive = nlp(r'\??\C:\test \?\C:\test  C:\test')
    system_root = nlp(r'C:\Windows\test')
    user_temp = nlp(r'C:\Users\Alice\appdata\local\temp\test')
    user_profile = nlp(r'C:\Users\Alice\test')
    # Names follow the stems asserted below: plain ``Program Files`` is
    # expected to stem to ``?pf64`` and ``Program Files (x86)`` to ``?pf86``.
    prog_files_64 = nlp(r'"C:\Program Files\test"')
    prog_files_86 = nlp(r'"C:\Program Files (x86)\test"')
    # ``Syswow64`` is expected to stem to ``?sys32`` and ``System32`` to
    # ``?sys64``.
    sys_32 = nlp(r'C:\Windows\Syswow64')
    sys_64 = nlp(r'C:\Windows\System32')

    assert guid._.stems == [r'\?\volume{guid}']
    assert system_drive._.stems == [r'?c\test', r'?c\test', r'?c\test']
    assert system_root._.stems == [r'?win\test']
    assert user_temp._.stems == [r'?usrtmp\test']
    assert user_profile._.stems == [r'?usr\test']
    assert prog_files_64._.stems == [r'"?pf64\test"']
    assert prog_files_86._.stems == [r'"?pf86\test"']
    assert sys_32._.stems == [r'?sys32']
    assert sys_64._.stems == [r'?sys64']
Exemple #9
0
def test_tokenizer_integration(nlp):
    """Check that a ``CommandLineTokenizer`` can be installed on the pipeline.

    After assigning the tokenizer to ``nlp.tokenizer``, reading the attribute
    back must yield a ``CommandLineTokenizer`` instance.
    """
    tokenizer = CommandLineTokenizer(nlp.vocab)
    nlp.tokenizer = tokenizer
    # isinstance is the idiomatic type check and avoids a hard-coded class
    # name string that would go stale on a rename.
    assert isinstance(nlp.tokenizer, CommandLineTokenizer)