예제 #1
0
"""
:type: OpenAttack.utils.BertClassifier
:Size: 1.23GB
:Package Requirements:
    * transformers
    * pytorch

Pretrained BERT model on MNLI dataset. See :py:data:`Dataset.MNLI` for detail.
"""

from OpenAttack.utils import make_zip_downloader, BertClassifier

NAME = "Victim.BERT.MNLI"

URL = "https://cdn.data.thunlp.org/TAADToolbox/victim/bert_mnli.zip"
DOWNLOAD = make_zip_downloader(URL)


def LOAD(path):
    """Load the pretrained BERT victim classifier from *path*.

    :param path: directory containing the unzipped model files.
    :return: an ``OpenAttack.utils.BertClassifier`` instance.

    NOTE(review): the classifier is constructed with 2 output classes even
    though MNLI normally has 3 (entailment / neutral / contradiction) —
    confirm this matches the packaged checkpoint.
    """
    # Removed an unused function-scope import (`from OpenAttack import
    # Classifier`) that was never referenced.
    return BertClassifier(path, 2)
예제 #2
0
"""
:type: function
:Size: 2.41MB

Model files for pos tagger in nltk.
`[code] <https://github.com/sloria/textblob-aptagger>`__
"""
from OpenAttack.utils import make_zip_downloader
import os

NAME = "TProcess.NLTKPerceptronPosTagger"

URL = "/TAADToolbox/averaged_perceptron_tagger.pickle.zip"
DOWNLOAD = make_zip_downloader(URL, "averaged_perceptron_tagger.pickle")


def LOAD(path):
    """Load nltk's averaged-perceptron POS tagger and return its ``tag`` method.

    :param path: directory containing ``averaged_perceptron_tagger.pickle``.
    """
    import nltk
    tagger = nltk.tag.PerceptronTagger(load=False)
    pickle_file = os.path.join(path, "averaged_perceptron_tagger.pickle")
    tagger.load("file:" + pickle_file)
    return tagger.tag
예제 #3
0
"""
:type: OpenAttack.utils.WordVector
:Size: 3GB
"""
import numpy as np
import os
from OpenAttack.utils import make_zip_downloader

NAME = "AttackAssist.ChineseWord2Vec"

URL = "/TAADToolbox/chinese-merge-word-embedding.txt.zip"
DOWNLOAD = make_zip_downloader(URL, "chinese-merge-word-embedding.txt")


def LOAD(path):
    """Build a ``WordEmbedding`` from the Chinese merged word-embedding file.

    Each line of the text file is expected to hold a word followed by its
    space-separated float vector; rows whose vector is not exactly
    300-dimensional (e.g. a header line or a malformed row) are skipped.

    :param path: directory containing ``chinese-merge-word-embedding.txt``.
    :return: an ``OpenAttack.attack_assist.WordEmbedding`` instance.
    """
    from OpenAttack.attack_assist import WordEmbedding
    id2vec = []
    word2id = {}
    with open(os.path.join(path, "chinese-merge-word-embedding.txt"), "r", encoding="utf-8") as f:
        # Iterate the file object lazily instead of f.readlines(): the raw
        # file is ~3GB, so materializing every line into a list first would
        # roughly double peak memory for no benefit.
        for line in f:
            tmp = line.strip().split(' ')
            word = tmp[0]
            embed = np.array([float(x) for x in tmp[1:]])
            if len(embed) != 300:
                continue
            word2id[word] = len(word2id)
            id2vec.append(embed)
    id2vec = np.stack(id2vec)
    return WordEmbedding(word2id, id2vec)
예제 #4
0
"""
:type: function
:Size: 158.351KB

Model files for nltk punkt sentence tokenizer.
"""
from OpenAttack.utils import make_zip_downloader
import os

NAME = "TProcess.NLTKSentTokenizer"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/punkt.english.pickle.zip"
DOWNLOAD = make_zip_downloader(URL, "english.pickle")


def LOAD(path):
    """Load the nltk punkt sentence tokenizer and return its ``tokenize`` method.

    :param path: directory containing ``english.pickle``.
    """
    import nltk
    resource_url = "file:" + os.path.join(path, "english.pickle")
    tokenizer = nltk.data.load(resource_url)
    return tokenizer.tokenize

예제 #5
0
"""
:type: OpenAttack.utils.WordVector
:Size: 61.998MB

Counter-fitting Word Vectors to Linguistic Constraints.
`[pdf] <https://www.aclweb.org/anthology/N16-1018.pdf>`__
"""
import numpy as np
import os
from OpenAttack.utils import make_zip_downloader

NAME = "AttackAssist.CounterFit"

URL = "/TAADToolbox/counter-fitted-vectors.txt.zip"
DOWNLOAD = make_zip_downloader(URL, "counter-fitted-vectors.txt")


def LOAD(path):
    from OpenAttack.attack_assist import WordEmbedding
    with open(os.path.join(path, "counter-fitted-vectors.txt"), "r", encoding='utf-8') as f:
        id2vec = []
        word2id = {}
        for line in f.readlines():
            tmp = line.strip().split(" ")
            word = tmp[0]
            embed = np.array([float(x) for x in tmp[1:]])
            if len(embed) != 300:
                continue
            word2id[word] = len(word2id)
            id2vec.append(embed)
        id2vec = np.stack(id2vec)