default="https://www.google.com",
                    help="Website to test")

args = parser.parse_args()
tokenizerFolder = args.tokenizer_folder
savedModelDirectory = args.model_dir
websiteToTest = args.website_to_test
threshold = args.threshold

# Loading files
# Load tokenization files
tokenizer = ByteLevelBPETokenizer(
    tokenizerFolder + "/tokenizer.tok-vocab.json",
    tokenizerFolder + "/tokenizer.tok-merges.txt",
)
tokenizerVocabSize = tokenizer.get_vocab_size()
print("Tokenizer files have been loaded and the vocab size is %d..." %
      tokenizerVocabSize)

# Load saved model
model = load(savedModelDirectory + "/phishytics-model.joblib")
print("Model loaded...")

# Load document frequency dictionary
docDict = np.load(savedModelDirectory +
                  "/phishytics-model-tfidf-dictionary.npy",
                  allow_pickle=True).item()
print("Document frequency dictionary loaded...")

# Testing
print("Loading webpage...")
def architecture_search(process_id):
    os.makedirs(f"checkpoints/{process_id+1}")
    os.makedirs(f"tokenizer/{process_id+1}")

    files = glob.glob("../../data/pre_abstract_txts/*.txt")

    tok_sizes = list(range(100, 2000, 100))
    hidden_sizes = list(range(12, 300, 12))
    emb_sizes = list(range(10, 250, 10))
    cased = [True, False]

    batch_size = 1

    results = {}
    choices = list(itertools.product(tok_sizes, hidden_sizes, emb_sizes,
                                     cased))
    random.shuffle(choices)

    best_acc = -np.inf
    while len(choices) > 0:
        tok_size, hidden_size, emb_size, cased = choices.pop()
        print(tok_size, hidden_size, emb_size, cased)

        # lowercase the corpus only for the uncased configurations
        tokenizer = ByteLevelBPETokenizer(lowercase=not cased)
        tokenizer.train(files, vocab_size=tok_size, special_tokens=["[PAD]"])

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        dataset = TextDataset(data_dir="../../data/pre_abstract_txts",
                              labels_dir="../../data/pre_abstract_labels",
                              device=device,
                              tokenizer=tokenizer,
                              batch_size=batch_size)
        test_dataset = TextDataset(
            data_dir="../../data/pre_abstract_txts",
            labels_dir="../../data/pre_abstract_labels_test",
            device=device,
            tokenizer=tokenizer,
            batch_size=batch_size)
        model = LSTMTagger(vocab_size=tokenizer.get_vocab_size(),
                           embedding_dim=emb_size,
                           lstm_dim=hidden_size,
                           dropout=0,
                           n_classes=len(dataset.classes)).to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters())
        # optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, nesterov=True, lr=v)

        epoch = 0
        n = 3
        test_acc = -np.inf
        log_interval = 10  # log every n batches
        weights = copy.deepcopy(model.state_dict())
        while True:
            dataset.shuffle()
            epoch += 1
            model.train()
            total_loss = 0.
            pbar = tqdm.tqdm(enumerate(dataset), desc=f"epoch {epoch}")
            for i, (x, y) in pbar:
                # reset gradients
                optimizer.zero_grad()
                # feed forward batch
                output = model(x)
                # calculate loss
                loss = criterion(output.transpose(1, 2), y)
                # back propagate loss
                loss.backward()
                # norm and clip gradients
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()

                pbar.set_description(
                    f'epoch {epoch} | batch {i + 1:d}/{len(dataset)} | loss {loss.item():.2f}'
                )

            model.eval()
            a, c = 0, 0
            with torch.no_grad():
                t_loss = 0
                for i, (x, y) in enumerate(test_dataset):
                    output = model(x)
                    loss = criterion(output.transpose(1, 2), y)
                    t_loss += loss.item()
                    for p, t in zip(torch.argmax(output, -1), y):
                        for pi, ti in zip(p, t):
                            a += 1
                            if pi == ti:
                                c += 1
                acc = c / a
                if acc <= test_acc and n > 0:
                    n -= 1
                    continue
                elif acc <= test_acc:
                    break
                print(t_loss, acc)
                weights = copy.deepcopy(model.state_dict())
                test_acc = acc
        # record the best accuracy reached for this configuration
        results[(tok_size, hidden_size, emb_size, cased)] = test_acc
        print(
            list(
                sorted([(k, v) for k, v in results.items()],
                       key=lambda y: y[1],
                       reverse=True))[:10])
        print(best_acc, test_acc)
        if test_acc > best_acc:
            best_acc = test_acc
            dir_path = f"tokenizer/{process_id+1}/lstm-tagger-{best_acc:.6f}"
            if os.path.exists(dir_path):
                continue
            torch.save(
                weights,
                f"checkpoints/{process_id+1}/lstm-tagger-{best_acc:.6f}.pt")
            os.makedirs(dir_path)
            tokenizer.save(dir_path)
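
# Hypothetical driver for the random search above (not part of the original
# snippet): `process_id` only namespaces the checkpoints/ and tokenizer/
# sub-directories, so several workers could run the search in parallel.
if __name__ == "__main__":
    import multiprocessing

    n_workers = 4  # assumed worker count
    # "spawn" avoids issues with CUDA in forked child processes
    ctx = multiprocessing.get_context("spawn")
    with ctx.Pool(n_workers) as pool:
        pool.map(architecture_search, range(n_workers))
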
Example #3
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary with using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)

        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF

        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
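
# A small, self-contained illustration (toy corpus, not from the original code) of
# the round trip that helper_encode/helper_decode wrap: encode() exposes both
# .tokens and .ids, and decode() maps the ids back to a text string.
from tokenizers import ByteLevelBPETokenizer

with open("/tmp/bpe_toy_corpus.txt", "w") as f:
    f.write("hello world\nhello tokenizers\n")

toy = ByteLevelBPETokenizer(add_prefix_space=True)
toy.train(["/tmp/bpe_toy_corpus.txt"], vocab_size=300, show_progress=False)

enc = toy.encode("hello world")
print(enc.tokens)           # what helper_encode returns
print(toy.decode(enc.ids))  # what helper_decode returns
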
Example #4
for (_, _, f) in walk(labeledDataFolder + "/legitimate_htmls"):
    files.extend(
        [labeledDataFolder + "/legitimate_htmls/" + file for file in f])
for (_, _, f) in walk(labeledDataFolder + "/phishing_htmls"):
    files.extend([labeledDataFolder + "/phishing_htmls/" + file for file in f])
print("Total number of html files: %d\n" % len(files))

# Writing data, one html file per line. This is the format the tokenizer expects
print("Writing html data into a single file...")
output = open("tokenizer/htmlCodePerLine.txt", "w")
count = 0
for file in files:
    count = count + 1
    print("Files processed: %d, Total files: %d" % (count, len(files)))
    fileData = io.open(file, "r", errors="ignore").readlines()
    fileData = ''.join(str(line) for line in fileData)
    fileData = fileData.replace("\n", " ")
    output.write(fileData + "\n")
output.close()

# Starting tokenization
print("\nStarting tokenization with BPE")
tokenizer = ByteLevelBPETokenizer()
tokenizer.train("tokenizer/htmlCodePerLine.txt",
                min_frequency=minFrequency,
                vocab_size=vocabSize)
print(
    "Vocabulary size is: %d\nNOTE: Sometimes, the vocab size might not be equal to the input 'vocab_size'\n"
    % (tokenizer.get_vocab_size()))
tokenizer.save("tokenizer", "tokenizer.tok")
print("Tokenizer files have been saved in 'tokenizer' directory...")
Example #5
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            raise ValueError(
                'Only use --dict-lower false with --dict-tokenizer bytelevelbpe'
            )
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary with using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        for i in range(self.tokenizer.get_vocab_size() - 4):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save(dir_name, file_name)
Example #6
class CodeTrainedBPE_Translation_DataProcessor(DataProcessor, Dataset):
    def __init__(self, task_data, max_src_len=512, max_tgt_len=512):
        """
        This data processor tokenizes and numericalises using a custom byte pair 
        encoding trained on the codeSearchNet train data with full docstrings.
        """
        self.task_data = task_data
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        self.tokenizer = ByteLevelBPETokenizer(
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-vocab.json",
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-merges.txt"
        )
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        self.__remove_long_samples()

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        src, tgt = self.task_data[idx]
        sample = {'src': self.encode(src), 'tgt': self.encode(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def __remove_long_samples(self):
        for i in tqdm.tqdm(list(reversed(range(len(self.task_data)))),
                           desc="removing long samples"):
            src, tgt = self.task_data[i]
            if len(self.encode(src)) > self.max_src_len or len(
                    self.encode(tgt)) > self.max_tgt_len:
                del self.task_data[i]

    def encode(self, sample):
        """
        sample: str: the input string to encode
        """
        return [self.SOS] + self.tokenizer.encode(sample).ids + [self.EOS]

    def encode_src(self, sample):
        return self.encode(sample)

    def encode_tgt(self, sample):
        return self.encode(sample)

    def encode_to_tensor(self, input_samples):
        """
        input_samples: [str]: one or more strings to convert to a single padded tensor. (Seq_len x batch)
        """
        return pad_sequence([
            torch.Tensor(self.encode(sample)).type(torch.LongTensor)
            for sample in input_samples
        ],
                            padding_value=self.PAD)

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD)
        return collated_samples

    def decode(self, ids):
        """
        ids: [int]: ids to decode
        """
        return self.tokenizer.decode(ids)

    def decode_src(self, ids):
        return self.decode(ids)

    def decode_tgt(self, ids):
        return self.decode(ids)

    def validate_prediction(self, numerical_sequence):
        # there are no constraints
        return True

    def prediction_is_complete(self, numerical_sequence):
        return self.EOS in numerical_sequence

    def decode_tensor(self, output_tensor):
        """
        output_tensor: [[int]]: model output (Seq_len x batch)
        """
        batch_first_output_tensor = output_tensor.T
        return [
            self.decode(sequence.cpu().tolist())
            for sequence in batch_first_output_tensor
        ]

    def to_dataloader(self,
                      batch_size,
                      repeat=False,
                      num_workers=4,
                      shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,
                          drop_last=False, collate_fn=self.collate, shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
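
# A tiny illustration (toy tensors, not from the original class) of why
# decode_tensor transposes: torch.nn.utils.rnn.pad_sequence defaults to
# batch_first=False, so collate() returns tensors shaped (seq_len, batch).
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
padded = pad_sequence(batch, padding_value=0)
print(padded.shape)    # torch.Size([3, 2]) -> seq_len x batch
print(padded.T.shape)  # batch-first view, as used when decoding
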
Example #7
class Parse_Tree_Translation_DataProcessor(Dataset):
    def __init__(
            self,
            task_data,
            max_length=500,
            tokenizer_dir="/nfs/phd_by_carlos/notebooks/datasets/code_search_net/",
            grammar_path="src/tree-sitter/tree-sitter-python/src/grammar.json",
            **kwargs):
        self.task_data = task_data
        self.max_length = max_length
        self.tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "code_bpe_hugging_32k-vocab.json",
            tokenizer_dir + "code_bpe_hugging_32k-merges.txt")
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        with open(grammar_path, "r") as grammar_file:
            self.python_grammar = json.load(grammar_file)

        extra_externals = {
            "_string_start": {
                "type": "PATTERN",
                "value": '"'
            },
            "_string_content": {
                "type": "PATTERN",
                "value": "[A-Za-z0-9 _,.()\/{}!$@'*]*"
            },
            "_string_end": {
                "type": "PATTERN",
                "value": '"'
            },
            "_newline": {
                "type": "BLANK"
            }
        }
        for node_type, member in extra_externals.items():
            self.python_grammar["rules"][node_type] = member

        self.python_parser = Code_Parser(self.python_grammar, "python",
                                         **kwargs)
        self.node_processor = Node_Processor()
        self.tree_vocab, grammar_patterns = get_grammar_vocab(
            self.python_grammar)

        self.tokenizer.add_tokens(["<REDUCE>"])
        for tree_token in sorted(self.tree_vocab):
            if len(self.tokenizer.encode(tree_token).tokens) != 1:
                self.tokenizer.add_tokens([tree_token])

        # filtering the data
        filtered_task_data = []
        for desc, code in self.task_data:
            numerical_code_sequence = self.encode_tgt(code)
            numerical_desc_sequence = self.encode_src(desc)
            token_sequence = self.numerical_to_token_sequence(
                numerical_code_sequence)
            if self.python_parser.is_valid_sequence(token_sequence) and len(
                    token_sequence) <= max_length and len(
                        numerical_desc_sequence) <= max_length:
                filtered_task_data.append((desc, code))
            elif len(token_sequence) > max_length or len(
                    numerical_desc_sequence) > max_length:
                print(
                    f"Sequence too long: src->{len(numerical_desc_sequence)}, tgt->{len(token_sequence)}"
                )
            else:
                print(f"Could not parse and reconstruct: {code}")
        self.task_data = filtered_task_data

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError

        src, tgt = self.task_data[idx]
        sample = {'src': self.encode_src(src), 'tgt': self.encode_tgt(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def encode_src(self, desc_str):
        return [self.SOS] + self.tokenizer.encode(desc_str).ids + [self.EOS]

    def encode_tgt(self, code_str):
        code_sequence = self.python_parser.code_to_sequence(code_str)
        numerical_code = []
        for code_token in code_sequence:
            numerical_code += self.tokenizer.encode(code_token).ids
        return [self.SOS] + numerical_code + [self.EOS]

    def decode_src(self, numerical_desc):
        """
        numerical_desc: [int]: ids to decode
        """
        return self.tokenizer.decode(numerical_desc)

    def numerical_to_token_sequence(self, numerical_code):
        token_sequence = [
            self.tokenizer.decode([token_idx]) for token_idx in numerical_code
            if token_idx not in [self.SOS, self.EOS, self.PAD, self.CLS]
        ]
        return token_sequence

    def decode_tgt(self, numerical_code):
        token_sequence = self.numerical_to_token_sequence(numerical_code)
        partial_tree = self.python_parser.sequence_to_partial_tree(
            token_sequence)
        return self.node_processor.pretty_print(
            partial_tree.root), partial_tree

    def validate_prediction(self, current_prediction):
        #         print(f"validating: {current_prediction}")
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.is_valid_sequence(token_sequence)

    def prediction_is_complete(self, current_prediction):
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.sequence_to_partial_tree(
            token_sequence).is_complete

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD)
        return collated_samples

    def to_dataloader(self, batch_size, num_workers=4, shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,
                          drop_last=False, collate_fn=self.collate, shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
Example #8
    def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
        self.dictionary = Dictionary()

        if use_bpe:
            assert os.path.exists(path), "Path does not exist: " + path

            print(
                "-------------------------------------------------------------"
            )

            tokenizer = ByteLevelBPETokenizer()
            if len(tokenizer_data) != 0:
                print("Training tokenizer on: " +
                      os.path.join(tokenizer_data, 'train.txt'))
                tokenizer.train([os.path.join(tokenizer_data, 'train.txt')],
                                vocab_size=vocab_size,
                                show_progress=False)
            else:
                print("Training tokenizer on: " +
                      os.path.join(path, 'train.txt'))
                tokenizer.train(
                    [
                        os.path.join(path, 'train.txt')
                        # os.path.join(path, 'valid.txt'),
                        # os.path.join(path, 'test.txt')
                    ],
                    vocab_size=vocab_size,
                    show_progress=False)
            print(
                "-------------------------------------------------------------"
            )

            print("Encoding dataset at: " + path)
            with open(os.path.join(path, 'train.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                tokens = len(enc.ids)
                ids = torch.LongTensor(tokens)

                for index, id in enumerate(enc.ids):
                    ids[index] = id
                self.train = ids
                self.dictionary.avg_characters_per_token['train'] = len(
                    text) / len(enc.ids)

            with open(os.path.join(path, 'valid.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                tokens = len(enc.ids)
                ids = torch.LongTensor(tokens)

                for index, id in enumerate(enc.ids):
                    ids[index] = id
                self.valid = ids
                self.dictionary.avg_characters_per_token['valid'] = len(
                    text) / len(enc.ids)

            with open(os.path.join(path, 'test.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                tokens = len(enc.ids)
                ids = torch.LongTensor(tokens)

                for index, id in enumerate(enc.ids):
                    ids[index] = id
                self.test = ids
                self.dictionary.avg_characters_per_token['test'] = len(
                    text) / len(enc.ids)
            print(
                "-------------------------------------------------------------"
            )

            self.dictionary.word2idx = tokenizer.get_vocab()
            self.dictionary.idx2word = [
                tokenizer.id_to_token(x)
                for x in range(tokenizer.get_vocab_size())
            ]
            self.dictionary.total = tokenizer.get_vocab_size()

        else:
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
            self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
            self.test = self.tokenize(os.path.join(path, 'test.txt'))
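
# The three nearly identical train/valid/test blocks above could be factored into
# a helper such as this one (a sketch, not part of the original class): encode one
# split file and return the id tensor plus the average characters per token.
def _encode_split(tokenizer, file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    enc = tokenizer.encode(text)
    return torch.LongTensor(enc.ids), len(text) / len(enc.ids)

# e.g.: self.train, self.dictionary.avg_characters_per_token['train'] = \
#           _encode_split(tokenizer, os.path.join(path, 'train.txt'))
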
def getURL():
	if request.method == 'POST':
		urlname  = request.form['url']
		url = request.form['url']
		print(url)
		tokenizerFolder = "tokenizer"
		savedModelDirectory = "saved_models"
		websiteToTest = url
		threshold = 0.5
		tokenizer = ByteLevelBPETokenizer(
			tokenizerFolder + "/tokenizer.tok-vocab.json",
			tokenizerFolder + "/tokenizer.tok-merges.txt",
		)
		tokenizerVocabSize = tokenizer.get_vocab_size()
		print("Tokenizer files have been loaded and the vocab size is %d..." % tokenizerVocabSize)
		model = load(savedModelDirectory + "/phishytics-model.joblib")
		print("Model loaded...")

		# Load document frequency dictionary
		docDict = np.load(savedModelDirectory + "/phishytics-model-tfidf-dictionary.npy", allow_pickle=True).item()
		print("Document frequency dictionary loaded...")

		# Testing
		print("Loading webpage...")
		try:
			request1 = requests.get(websiteToTest)
			webpageHtml = str(request1.text)
			webpageHtml = webpageHtml.replace("\n", " ")
		except Exception as e:
			print('\n',e)
			print("\nAn error occurred, exiting now... ")
			exit()
        
		# Convert text into feature vector
		output = tokenizer.encode(webpageHtml)
		outputDict = collections.Counter(output.ids)

		# Apply tfidf weighting
		totalFilesUnderConsideration = docDict["totalFilesUnderConsideration"]
		array = [0] * tokenizerVocabSize
		for item in outputDict:
			if len(docDict[item]) > 0:
				array[item] = (outputDict[item]) * (math.log10( totalFilesUnderConsideration / len(docDict[item])))
		predictionProbability = model.predict_proba([array])[0][1]
		print("\n****************************\n--> Probability that the website is phishing: %.2f" % (predictionProbability * 100))

		prediction = "NOT PHISHING"
		predicted_value = 0
		if predictionProbability > threshold:
			prediction = "PHISHING"
			predicted_value = 1
		print("--> Based on your threshold of %.2f, this website is +++'%s'+++" % (threshold, prediction))
		print("****************************")
		
        #print(predicted_value)
		if predicted_value == 0:    
			value = "Legitimate"
			return render_template("home.html",error=value)
		else:
			value = "Phishing"
			return render_template("home.html",error=value)
class HuggingfaceTokenizerBPE(nn.Module):
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a number larger than that, we will also have to
        # change the BERT configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'

        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']

        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(
            os.path.join(dataset_info_path,
                         f'tokenizer_{config_name}_{vocab_size}-vocab.json'))

        # The loading is only properly implemented starting from version 0.8. However, it makes the system use a lot of
        #  CPU for no reason (it is much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(
                    dataset_info_path,
                    f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join into a single file. This should NOT be necessary but it does not work properly with a lot of files
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(
                        text_files,
                        desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
                text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files,
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')

        # No "else", always load for consistency
        vocab_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)

        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0]
            for tok in special_tokens
        }

    @property
    def device(self):
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = list(tokens.cpu().numpy())
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        # tokenizers expects a plain int id, so unwrap tensors rather than wrap ints
        if isinstance(token_id, torch.Tensor):
            token_id = int(token_id.item())
        return self.tokenizer.id_to_token(token_id)

    def token_to_id(self, token):
        assert type(token) == str
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply for PyCharm to find the correct reference to the methods of the class
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)
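
# A short, self-contained illustration (toy corpus, assumed special tokens) of the
# index_special_tokens pattern used above: register special tokens, then look up
# their ids via encode(tok).ids[0].
from tokenizers import ByteLevelBPETokenizer

with open("/tmp/special_token_toy_corpus.txt", "w") as f:
    f.write("a tiny toy corpus for the tokenizer\n")

toy = ByteLevelBPETokenizer()
toy.train(["/tmp/special_token_toy_corpus.txt"], vocab_size=300, show_progress=False,
          special_tokens=["[PAD]", "[CLS]"])
toy.add_special_tokens(["[PAD]", "[CLS]"])
index_special_tokens = {tok: toy.encode(tok).ids[0] for tok in ["[PAD]", "[CLS]"]}
print(index_special_tokens)
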