def train(cfg):
    """
    training begin
    :param cfg: config file
    :return:
    """
    datasets = build_dataset(cfg)
    algo = TFIDFClustring(cfg)
    vocab = Vocab(cfg)
    summary = SummaryTxt(cfg)
    keyword = Keyword(cfg, summary)

    processed_news_num = 0
    batch_size = cfg.SOLVER.BATCH_SIZE

    print('start training:')
    for seg_id in trange(0, datasets.file_num, batch_size):
        seg = []
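        # accumulate `batch_size` dataset files into one segment before clustering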
        for batch_idx in range(batch_size):
            batch, seg_size = datasets.getitem(seg_id + batch_idx)
            seg.extend(batch)
            processed_news_num += seg_size

        algo.run(segments=seg,
                 vocab=vocab,
                 seg_id=seg_id,
                 keyword=keyword,
                 summary=summary)
        # keyword.update_per_seg(new_updated_topic=new_updated_topic)
        print("seg idx: {}. processed news: {}".format(seg_id,
                                                       processed_news_num))
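
# A minimal usage sketch, assuming a YACS-style config (suggested by cfg.SOLVER.BATCH_SIZE);
# the loader name and config path below are illustrative assumptions.
# cfg = get_cfg_defaults()
# cfg.merge_from_file('configs/tfidf_clustering.yaml')
# train(cfg)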
Example #2
def load_corpus_data(data_path,
                     language_name,
                     start_token,
                     end_token,
                     mask_token,
                     vocab_path,
                     rebuild_vocab,
                     unk="UNK",
                     threshold=0):
    if rebuild_vocab:
        v = Vocab(language_name,
                  start_token,
                  end_token,
                  mask_token,
                  threshold=threshold)

    corpus = []

    with open(data_path) as f:

        data = f.read().strip().split("\n")

        for line in data:
            line = line.strip()
            line = " ".join([start_token, line, end_token])

            if rebuild_vocab:
                v.add_sentence(line)

            corpus.append(line)

    data2index = []

    if rebuild_vocab:
        v.add_unk(unk)
        v.save(vocab_path)
    else:
        v = Vocab.load(vocab_path)

    for line in corpus:
        data2index.append([v.get_index(token) for token in line.split()])

    return data2index, v
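
# A minimal usage sketch; the corpus path, vocab path, and special tokens below are
# illustrative assumptions rather than values from the original project.
# data2index, v = load_corpus_data('data/train.en', 'en',
#                                  start_token='<s>', end_token='</s>',
#                                  mask_token='<mask>',
#                                  vocab_path='data/en.vocab',
#                                  rebuild_vocab=True, unk='UNK', threshold=1)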
Example #3
    print(args)

    # prepare datasets and obtain the arguments

    t = datetime.datetime.now()
    timestamp = '{}-{}-{}-{}'.format(t.date(), t.hour, t.minute, t.second)

    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print("Loading the vocab...")
    vocab = Vocab(os.path.join(args.data_path, args.vocab_file), 3)

    trainset = HistoryDataset(data_dir=args.data_path,
                              segment_file='train_' + args.segment_file,
                              vectors_file=args.vectors_file,
                              chain_file='train_' + args.chains_file,
                              split=args.split)

    testset = HistoryDataset(data_dir=args.data_path,
                             segment_file='test_' + args.segment_file,
                             vectors_file=args.vectors_file,
                             chain_file='test_' + args.chains_file,
                             split='test')

    valset = HistoryDataset(data_dir=args.data_path,
                            segment_file='val_' + args.segment_file,
Example #4
def test_item_file(end_test_file, embedding_file_path, vocab_file_path,
                   use_gpu):
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    #with open(end_test_file) as f:
    #    examples = [json.loads(line) for line in f]
    with open(end_test_file) as f:
        examples = list()
        for line in f:
            if line and not line.isspace():
                examples.append(json.loads(line))
    #print(examples[0])
    test_dataset = Dataset(examples)

    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)
    load_dir = os.path.join(args.input, 'model_files', 'CNN_RNN.pt')
    if use_gpu:
        checkpoint = torch.load(load_dir)
    else:
        checkpoint = torch.load(load_dir,
                                map_location=lambda storage, loc: storage)
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()
    doc_num = len(test_dataset)

    all_targets = []
    all_results = []
    all_probs = []
    all_acc = []
    all_p = []
    all_r = []
    all_f1 = []
    all_sum = []
    for batch in tqdm(test_iter):
        features, targets, summaries, doc_lens = vocab.make_features(batch)
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
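        # probs is a flat score vector over every sentence in the batch; slice it
        # per document using doc_lens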
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            stop = start + doc_len
            prob = probs[start:stop]
            hyp = []
            for _p, _d in zip(prob, doc):
                print(_p)
                print(_d)
                if _p > 0.5:
                    hyp.append(_d)
            if len(hyp) > 0:
                print(hyp)
                all_sum.append("###".join(hyp))
            else:
                all_sum.append('')
            all_targets.append(targets[start:stop])
            all_probs.append(prob)
            start = stop
    file_path_elems = end_test_file.split('/')
    file_name = 'TR-' + file_path_elems[-1]
    with open(os.path.join(args.output, file_name), mode='w',
              encoding='utf-8') as f:
        for text in all_sum:
            f.write(text.strip() + '\n')
    for item in all_probs:
        all_results.append([1 if tmp > 0.5 else 0 for tmp in item.tolist()])
    print(len(all_results))
    print(len(all_targets))
    print(len(all_probs))
    for _1, _2, _3 in zip(all_results, all_targets, all_probs):
        _2 = _2.tolist()
        _3 = _3.tolist()
        print("*" * 3)
        print('probs : ', _3)
        print('results : ', _1)
        print('targets : ', _2)
        # scikit-learn metrics expect (y_true, y_pred): ground-truth targets first
        tmp_acc = accuracy_score(_2, _1)
        tmp_p = precision_score(_2, _1)
        tmp_r = recall_score(_2, _1)
        tmp_f1 = f1_score(_2, _1)
        print('acc : ', tmp_acc)
        print('p : ', tmp_p)
        print('r : ', tmp_r)
        print('f1 : ', tmp_f1)
        all_acc.append(tmp_acc)
        all_p.append(tmp_p)
        all_r.append(tmp_r)
        all_f1.append(tmp_f1)
    print('all dataset acc : ', np.mean(all_acc))
    print('all dataset p : ', np.mean(all_p))
    print('all dataset r : ', np.mean(all_r))
    print('all dataset f1 : ', np.mean(all_f1))
    print('all results length : ', len(all_results))
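
# Hypothetical invocation; the file names below mirror the paths used in the train()
# example further down, but they are assumptions, not taken from this script.
# test_item_file(end_test_file=os.path.join(args.input, 'test_files', 'test.txt'),
#                embedding_file_path=os.path.join(args.project, 'embedding.npz'),
#                vocab_file_path=os.path.join(args.project, 'word2id.json'),
#                use_gpu=args.device is not None)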
Example #5
import json
from collections import defaultdict
from utils.Vocab import Vocab

with open('data/test_segments.json', 'r') as file:
    test_sg = json.load(file)

with open('data/test_chains.json', 'r') as file:
    test = json.load(file)

vocab = Vocab('data/vocab.csv', 3)

# given an img, provides the chains for which it was the target

target2chains = defaultdict(list)

for ch in test:
    target_id = ch['target']
    segment_list = ch['segments']

    target2chains[target_id].append(segment_list)

id_list = []

# segments ids, in the order in which they were encountered in the chains in the whole dataset

for c in test:
    segments = c['segments']

    for s in segments:
        if s not in id_list:
            id_list.append(s)

Example #6
def train():
    print("*"*100)
    print("train begin")
    # use gpu
    use_gpu = args.device is not None
    if torch.cuda.is_available() and not use_gpu:
        print("WARNING: You have a CUDA device, should run with -device 0")
    if use_gpu:
        # set cuda device and seed
        torch.cuda.set_device(args.device)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    # prepare paths
    embedding_file_path = os.path.join(args.project, "embedding.npz")
    vocab_file_path = os.path.join(args.project, "word2id.json")
    end_train_file = os.path.join(args.input, "train_files", "train.txt")
    train_files_dir = os.path.join(args.input, "train_files")

    # merge text files that share the same suffix
    merge_same_suf_text_file(train_files_dir, end_train_file, '.txt')

    print('Loading vocab, train and val datasets. Please wait a moment.')
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])  # alternatively: torch.Tensor(list(np.load(args.embedding)))
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    with open(end_train_file) as f:
        examples = list()
        for line in tqdm(f):
            if line and not line.isspace():
                examples.append(json.loads(line))
    train_dataset = Dataset(examples)
    print(train_dataset[:1])

    args.embed_num = embed.size(0)  # vocabulary size read from the embedding matrix
    args.embed_dim = embed.size(1)  # embedding dimension read from the embedding matrix
    args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]
    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()
    train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=False)
    criterion = nn.BCELoss()
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % (params))

    min_loss = float('inf')
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    net.train()

    t1 = time()
    for epoch in range(1, args.max_epoch + 1):
        print("*"*10, 'epoch ', str(epoch), '*'*50)
        for i, batch in enumerate(train_iter):
            print("*"*10, 'batch', i, '*'*10)
            features, targets, _, doc_lens = vocab.make_features(batch, args.seq_trunc)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            net.save()
            print('Epoch: %2d Loss: %f' % (epoch, loss))
    t2 = time()
    print('Total Cost: %f h' % ((t2 - t1) / 3600))
    print("Model configuration files saved to the output folder")