def process_text(body, origin_domain, sha):
    extract_urls = ExtractURL(body, origin_domain, sha)
    suspicious_urls = set(extract_urls.processing())
    indicators = extract_urls.indicators
    tok = Tokenizer(body, sha)
    passwordlist = tok.processing()
    return indicators, list(suspicious_urls), passwordlist
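# --- Usage sketch (not part of the original module) ---------------------------
# process_text bundles URL extraction and password-candidate tokenization for a
# single text blob. The names below (`part_body`, `sender_domain`,
# `attachment_sha`) are hypothetical placeholders for illustration only.
#
#   indicators, suspicious_urls, passwordlist = process_text(
#       part_body, sender_domain, attachment_sha)
#   # indicators: indicator entries collected by ExtractURL
#   # suspicious_urls: deduplicated list of suspicious URLs
#   # passwordlist: candidate passwords tokenized from the text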
def convert_examples_to_features(examples: List[CoupletExample], tokenizer: Tokenizer):
    features = []
    for example in tqdm(examples, desc="creating features"):
        seq_ids = tokenizer.convert_tokens_to_ids(example.seq)
        tag_ids = tokenizer.convert_tokens_to_ids(example.tag)
        features.append(CoupletFeatures(seq_ids, tag_ids))
    return features
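# --- Hypothetical usage (an assumption, not from this file) --------------------
# convert_examples_to_features expects CoupletExample objects exposing .seq and
# .tag as token sequences (e.g. lists of characters). The constructor call below
# assumes positional (seq, tag) arguments, which this file does not confirm.
#
#   examples = [CoupletExample(list("晚风摇树树还挺"), list("晨露润花花更红"))]
#   features = convert_examples_to_features(examples, tokenizer)
#   # each CoupletFeatures holds the (seq_ids, tag_ids) produced by the tokenizer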
def predict_demos(model, tokenizer: Tokenizer):
    demos = [
        "马齿草焉无马齿",
        "天古天今,地中地外,古今中外存天地",
        "笑取琴书温旧梦",
        "日里千人拱手划船,齐歌狂吼川江号子",
        "我有诗情堪纵酒",
        "我以真诚溶冷血",
        "三世业岐黄,妙手回春人共赞"
    ]
    sents = [
        torch.tensor(tokenizer.encode(sent)).unsqueeze(0) for sent in demos
    ]
    model.eval()
    device = next(model.parameters()).device
    for i, sent in enumerate(sents):
        sent = sent.to(device)
        with torch.no_grad():
            logits = model(sent).squeeze(0)
        pred = logits.argmax(dim=-1).tolist()
        pred = tokenizer.decode(pred)
        # Log the input first line (上联) and the predicted second line (下联).
        logger.info(f"上联:{demos[i]}。 预测的下联:{pred}")
def run():
    args = get_args()
    fdir = Path(args.dir)
    tb = SummaryWriter(args.logdir)  # TensorBoard writer for training curves
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    output_dir = Path(args.output)
    output_dir.mkdir(exist_ok=True, parents=True)
    logger.info(args)

    logger.info(f"loading vocab...")
    tokenizer = Tokenizer.from_pretrained(fdir / 'vocab.pkl')

    logger.info(f"loading dataset...")
    train_dataset = torch.load(fdir / 'train.pkl')
    test_dataset = torch.load(fdir / 'test.pkl')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)

    logger.info(f"initializing model...")
    model = init_model_by_key(args, tokenizer)
    model.to(device)
    loss_function = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.fp16:
        try:
            # apex.amp provides several levels of mixed-precision acceleration
            # to speed up PyTorch training.
            from apex import amp
            # Less common ops must be registered before use; some functions
            # (e.g. einsum) are not well supported in FP16, so avoid relying
            # on them heavily.
            amp.register_half_function(torch, 'einsum')
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level=args.fp16_opt_level)
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Reduce the learning rate when the monitored metric stops improving.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    logger.info(f"num gpu: {torch.cuda.device_count()}")

    global_step = 0
    for epoch in range(args.epochs):
        logger.info(f"***** Epoch {epoch} *****")
        model.train()
        t1 = time.time()
        accu_loss = 0.0
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()
            batch = tuple(t.to(device) for t in batch)
            input_ids, masks, lens, target_ids = batch
            logits = model(input_ids, masks)
            loss = loss_function(logits.view(-1, tokenizer.vocab_size),
                                 target_ids.view(-1))
            if torch.cuda.device_count() > 1:
                loss = loss.mean()
            accu_loss += loss.item()
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            optimizer.step()
            if step % 100 == 0:
                tb.add_scalar('loss', loss.item(), global_step)  # log to TensorBoard
                logger.info(
                    f"[epoch]: {epoch}, [batch]: {step}, [loss]: {loss.item()}")
            global_step += 1
        scheduler.step(accu_loss)
        t2 = time.time()
        logger.info(f"epoch time: {t2-t1:.5}, accumulation loss: {accu_loss:.6}")
        if (epoch + 1) % args.test_epoch == 0:
            predict_demos(model, tokenizer)
            bleu, rl = auto_evaluate(model, test_loader, tokenizer)
            logger.info(f"BLEU: {round(bleu, 9)}, Rouge-L: {round(rl, 8)}")
        if (epoch + 1) % args.save_epoch == 0:
            filename = f"{model.__class__.__name__}_{epoch + 1}.bin"
            filename = output_dir / filename
            save_model(filename, model, args, tokenizer)
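# A minimal sketch of what get_args() might define, inferred only from the
# attributes run() accesses (args.dir, args.logdir, args.no_cuda, args.output,
# args.batch_size, args.lr, args.fp16, args.fp16_opt_level, args.epochs,
# args.max_grad_norm, args.test_epoch, args.save_epoch). Every default value
# below is an assumption, not taken from the original project.
import argparse

def get_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", default="dataset", type=str)        # where vocab.pkl / train.pkl / test.pkl live
    parser.add_argument("--logdir", default="runs", type=str)        # TensorBoard log directory
    parser.add_argument("--output", default="output", type=str)      # where checkpoints are saved
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--lr", default=1e-3, type=float)
    parser.add_argument("--epochs", default=10, type=int)
    parser.add_argument("--max_grad_norm", default=1.0, type=float)
    parser.add_argument("--test_epoch", default=1, type=int)         # evaluate every N epochs
    parser.add_argument("--save_epoch", default=1, type=int)         # checkpoint every N epochs
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--fp16_opt_level", default="O1", type=str)  # apex AMP optimization level
    # init_model_by_key(args, tokenizer) presumably also reads a model-selection
    # argument (e.g. --model); its exact name is not visible in this file.
    return parser.parse_args()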
    dataset = TensorDataset(*tensors)
    return dataset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default='couplet', type=str)
    parser.add_argument("--output", default='dataset', type=str)
    parser.add_argument("--max_seq_len", default=32, type=int)
    args = parser.parse_args()

    input_dir = Path(args.input)
    output_dir = Path(args.output)
    # Create the output directory itself (not just its parent) so the .pkl
    # files below can be written.
    output_dir.mkdir(exist_ok=True, parents=True)
    vocab_file = input_dir / "vocabs"

    logger.info("creating tokenizer...")
    tokenizer = Tokenizer()
    tokenizer.build(vocab_file)

    logger.info("creating dataset...")
    train_dataset = create_dataset(input_dir / "train", tokenizer, args.max_seq_len)
    test_dataset = create_dataset(input_dir / "test", tokenizer, args.max_seq_len)

    logger.info("saving dataset...")
    tokenizer.save_pretrained(output_dir / "vocab.pkl")
    torch.save(train_dataset, output_dir / "train.pkl")
    torch.save(test_dataset, output_dir / "test.pkl")
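# A sketch of the collation that create_dataset presumably performs before the
# `TensorDataset(*tensors)` tail shown above, inferred from the batch unpacking
# in run() (input_ids, masks, lens, target_ids) and from CoupletFeatures holding
# (seq_ids, tag_ids). The padding scheme and the helper name are assumptions.
import torch
from torch.utils.data import TensorDataset

def build_tensors_sketch(features, pad_id, max_seq_len):
    input_ids, masks, lens, target_ids = [], [], [], []
    for f in features:
        seq = f.seq_ids[:max_seq_len]
        tag = f.tag_ids[:max_seq_len]
        pad = max_seq_len - len(seq)
        input_ids.append(seq + [pad_id] * pad)
        masks.append([1] * len(seq) + [0] * pad)   # 1 for real tokens, 0 for padding
        lens.append(len(seq))
        target_ids.append(tag + [pad_id] * (max_seq_len - len(tag)))
    tensors = (torch.tensor(input_ids), torch.tensor(masks),
               torch.tensor(lens), torch.tensor(target_ids))
    return TensorDataset(*tensors)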
examine_headers = ExamineHeaders(msg)
(origin_ip, rbl_listed, rbl_comment,
 mailfrom, mailto, origin_domain) = examine_headers.processing()
indicators += examine_headers.indicators

attachements = []
payload_results = []
suspicious_urls = set()

if msg.content_type.is_multipart():
    for p in msg.walk():
        extract_urls = ExtractURL(p.body, origin_domain)
        suspicious_urls |= set(extract_urls.processing())
        indicators += extract_urls.indicators
        if p.is_body():
            content = p.body
            tok = Tokenizer(content)
            passwordlist += tok.processing()
            # TODO process that string
        elif p.is_attachment() or p.is_inline():
            content_type = p.detected_content_type
            filename = p.detected_file_name
            attachements.append((filename, content_type))
            if filename is not None and len(filename) > 0:
                passwordlist.append(filename)
                prefix, suffix = os.path.splitext(filename)
                passwordlist.append(prefix)
            r, r_indicators = process_payload(filename, p.body, content_type,
                                              origin_domain, passwordlist)
            indicators += r_indicators
            payload_results.append(r)
        else:
            # What do we do there? Is it possible?
            pass