Example #1
def clean_up(src, pred_prop, clean_type=0, mean_max='mean',
             tokenize_alphabets=None):
    # Smooth per-character scores: every character in a multi-character
    # wordpiece is replaced by the word-level mean (or max) of its scores.
    for i, src_ in enumerate(src):
        src_split = split2words(src_, tokenize_alphabets=tokenize_alphabets)
        if clean_type == 0:
            for wordpiece in src_split.values():
                if len(wordpiece) > 1:
                    if mean_max == 'mean':
                        pred_prop[i][wordpiece] = torch.mean(
                            pred_prop[i][wordpiece])
                    else:
                        pred_prop[i][wordpiece] = torch.max(
                            pred_prop[i][wordpiece])
    return pred_prop
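split2words (not shown in these examples) supplies the character-index groups that clean_up pools over. A minimal, self-contained sketch of the same pooling idea, using toy scores and a hypothetical word-to-indices mapping:

import torch

# Toy per-character scores for one sequence (hypothetical data).
pred = torch.tensor([0.9, 0.2, 0.4, 0.1, 0.8, 0.6])
# Hypothetical word -> character-index groups, standing in for split2words().
word_groups = {0: [0, 1, 2], 1: [3], 2: [4, 5]}

for idx in word_groups.values():
    if len(idx) > 1:
        # Every character in the word gets the word-level mean,
        # as clean_up does when mean_max == 'mean'.
        pred[idx] = torch.mean(pred[idx])

print(pred)  # tensor([0.5000, 0.5000, 0.5000, 0.1000, 0.7000, 0.7000])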
Example #2
def test_alphaBert(DS_model,
                   dloader,
                   threshold=0.5,
                   is_clean_up=True,
                   ep=0,
                   train=False,
                   mean_max='mean',
                   rouge=False,
                   parallel=parallel):
    if not train:
        DS_model.to(device)
        if parallel:
            DS_model = torch.nn.DataParallel(DS_model)
    DS_model.eval()

    out_pred_res = []
    mIOU = []
    ROC_IOU_ = []

    ROC_threshold = torch.linspace(0, 1, 100).to(device)

    all_pred_trg = {'pred': [], 'trg': []}
    rouge_set = []

    with torch.no_grad():
        for batch_idx, sample in enumerate(dloader):
            src = sample['src_token']
            trg = sample['trg']
            att_mask = sample['mask_padding']
            origin_len = sample['origin_seq_length']

            src = src.float().to(device)
            trg = trg.float().to(device)
            att_mask = att_mask.float().to(device)
            origin_len = origin_len.to(device)

            pred_prop = DS_model(x=src, x_lengths=origin_len)
            if is_clean_up:
                pred_prop = clean_up(src,
                                     pred_prop,
                                     mean_max=mean_max,
                                     tokenize_alphabets=tokenize_alphabets)

            for i, src_ in enumerate(src):
                all_pred_trg['pred'].append(pred_prop[i][:origin_len[i]].cpu())
                all_pred_trg['trg'].append(trg[i][:origin_len[i]].cpu())

                if rouge:
                    src_split, src_isword = split2words(
                        src_,
                        rouge=rouge,
                        tokenize_alphabets=tokenize_alphabets)
                    referecne = []
                    hypothesis = []
                    for j in range(len(src_split)):
                        if src_isword[j] > 0:
                            if trg[i][src_split[j]][0].cpu() > threshold:
                                referecne.append(
                                    tokenize_alphabets.convert_idx2str(
                                        src_[src_split[j]]))
                            if pred_prop[i][src_split[j]][0].cpu() > threshold:
                                hypothesis.append(
                                    tokenize_alphabets.convert_idx2str(
                                        src_[src_split[j]]))

                    rouge_set.append((hypothesis, referecne))


#            mIOU += IOU_ACC(pred_prop,trg,origin_len, threshold)
#            ROC_IOU_.append(ROC(pred_prop,trg,origin_len, ROC_threshold))

            pred_selected = pred_prop > threshold
            trg_selected = trg > threshold

            for i, src_ in enumerate(src):
                a_ = tokenize_alphabets.convert_idx2str(src_[:origin_len[i]])
                s_ = tokenize_alphabets.convert_idx2str(src_[pred_selected[i]])
                t_ = tokenize_alphabets.convert_idx2str(src_[trg_selected[i]])
                #                print(a_,pred_prop[0],s_,t_)

                out_pred_res.append((a_, s_, t_, pred_prop[i]))
            print(batch_idx, len(dloader))
    out_pd_res = pd.DataFrame(out_pred_res)
    out_pd_res.to_csv('test_pred.csv', sep=',')

    make_statistics(all_pred_trg, ep=ep)

    DS_model.train()

    if rouge:
        rouge_res = rouge12l(rouge_set)
        rouge_res_pd = pd.DataFrame(rouge_res)
        rouge_res_pd.to_csv('./iou_pic/lstm/rouge_res.csv', index=False)
        rouge_res_np = np.array(rouge_res_pd)

        pd.DataFrame(rouge_res_np.mean(axis=0)).to_csv(
            './iou_pic/lstm/rouge_res_mean.csv', index=False)
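rouge12l and make_statistics are not shown in these examples; the export step above only averages the per-example rows that rouge12l returns. A minimal sketch of that averaging, assuming each row holds hypothetical ROUGE-1/ROUGE-2/ROUGE-L scores:

import numpy as np
import pandas as pd

# Hypothetical per-example rows: [rouge-1, rouge-2, rouge-l].
rouge_res = [[0.50, 0.30, 0.45],
             [0.70, 0.40, 0.65]]

rouge_res_pd = pd.DataFrame(rouge_res)
rouge_res_np = np.array(rouge_res_pd)

# Column-wise mean: one summary value per metric, as written to rouge_res_mean.csv above.
print(rouge_res_np.mean(axis=0))  # [0.6  0.35 0.55]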
Example #3
def run_d2s(DS_model,
            dloader,
            threshold=0.51,
            mean_max='mean'):
    
    leading_token_idx = tokenize_alphabets.alphabet2idx['|']
    padding_token_idx = tokenize_alphabets.alphabet2idx[' ']
    out_pred_res = []
    
    with torch.no_grad():
        for batch_idx, sample in enumerate(dloader): 
            print(batch_idx)
            src = sample['src_token']
            att_mask = sample['mask_padding']
                
            src = src.float().to(device)
            att_mask = att_mask.float().to(device)
            
            bs = src.shape          
#            pred_prop_bin, = DS_model(input_ids=src,
#                                 attention_mask=att_mask,
#                                 out = 'finehead')
            pooled_output, = DS_model(input_ids=src,
                                      attention_mask=att_mask,
                                      out='finehead')
            pred_prop_bin = pooled_output.view(*bs, -1)

            pred_prop = clean_up_v204_ft(src, pred_prop_bin,
                                         tokenize_alphabets=tokenize_alphabets,
                                         mean_max=mean_max)
            
            pred_prop2json = pred_prop > threshold
            pred_prop2json = pred_prop2json.int()
            
            srcstr = tokenize_alphabets.convert_idx2str(src[0])
            print('srcstr', srcstr)
#            print('00000', len(src[0]),len(pred_prop[0]),len(pred_prop2json[0]))
            for i, src_ in enumerate(src):
                hypothesis = []
                isselect_pred = False

                src_split, src_isword = split2words(src_,
                                                    tokenize_alphabets=tokenize_alphabets,
                                                    rouge=True)

                for j in range(len(src_split)):
                    if src_isword[j] > 0:
                        if pred_prop[i][src_split[j]][0].cpu() > threshold:
                            hypothesis.append(
                                tokenize_alphabets.convert_idx2str(
                                    src_[src_split[j]]))

                s_ = ''.join(w + ' ' for w in hypothesis)
                
                srcstr = tokenize_alphabets.convert_idx2str(src_)
    
                out_pred_res.append([srcstr, s_, pred_prop[i].cpu().numpy()])

    out_np_res = np.array(out_pred_res)
#    out_pred_BERT = pd.DataFrame(out_pred_res)
#    
#    out_json = out_pred_BERT.to_json()
#    print(out_json)
    return out_np_res
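run_d2s binarizes the cleaned scores before packing the output. A short sketch of that thresholding step with made-up scores:

import torch

pred_prop = torch.tensor([[0.20, 0.70, 0.55, 0.10]])  # hypothetical cleaned scores
threshold = 0.51

pred_prop2json = (pred_prop > threshold).int()
print(pred_prop2json)  # tensor([[0, 1, 1, 0]], dtype=torch.int32)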
Example #4
def ROC4():
    filepath = '../checkpoint_exe'
    tloader = test_loaders(datapath=filepath,
                           config=config,
                           tokenize_alphabets=tokenize_alphabets,
                           num_workers=4,
                           batch_size=batch_size)

    data_np, data_pd = tloader.get_data_np()

    D2S_test = alphabert_dataset.D2Lntuh(
        data_np,
        tokenize_alphabets,
        clamp_size=config['max_position_embeddings'],
        train=False)

    D2S_testloader = DataLoader(D2S_test,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=alphabert_dataset.collate_fn)

    out_pred_res = []

    #device = 'cpu'
    modelpath = os.path.join(filepath, 'W_d2s_total_0302_947.pth')
    DS_model = torch.load(modelpath)
    DS_model.to(device)
    DS_model.eval()
    threshold = 0.96

    dloader = D2S_testloader

    out_pred_res = []
    out_trg_res = []

    all_pred_trg_ours = {'pred': [], 'trg': []}

    rouge_set = []

    leading_token_idx = tokenize_alphabets.alphabet2idx['|']
    padding_token_idx = tokenize_alphabets.alphabet2idx[' ']

    ensemble = True
    mean_max = 'mean'

    with torch.no_grad():
        for batch_idx, sample in enumerate(dloader):
            print(batch_idx)
            src = sample['src_token']
            trg = sample['trg']
            att_mask = sample['mask_padding']
            origin_len = sample['origin_seq_length']

            src = src.float().to(device)
            trg = trg.float().to(device)
            att_mask = att_mask.float().to(device)
            origin_len = origin_len.to(device)

            bs = src.shape

            pooled_output, = DS_model(input_ids=src,
                                      attention_mask=att_mask,
                                      out='finehead')
            pred_prop_bin = pooled_output.view(*bs, -1)

            pred_prop = clean_up_v204_ft(src,
                                         pred_prop_bin,
                                         tokenize_alphabets=tokenize_alphabets,
                                         mean_max=mean_max)

            pred_selected = pred_prop > threshold
            trg_selected = trg > threshold

            for i, src_ in enumerate(src):
                src_split, src_isword = split2words(
                    src_, tokenize_alphabets=tokenize_alphabets, rouge=True)
                referecne = []
                hypothesis = []
                trg_ = []
                pred_ = []

                for j in range(len(src_split)):
                    if src_isword[j] > 0:
                        if trg[i][src_split[j]][0].cpu() > threshold:
                            referecne.append(
                                tokenize_alphabets.convert_idx2str(
                                    src_[src_split[j]]))
                        if pred_prop[i][src_split[j]][0].cpu() > threshold:
                            hypothesis.append(
                                tokenize_alphabets.convert_idx2str(
                                    src_[src_split[j]]))

                        trg_.append(trg[i][src_split[j]][0].cpu())
                        pred_.append(pred_prop[i][src_split[j]][0].cpu())

                rouge_set.append((hypothesis, referecne))

                all_pred_trg_ours['trg'].append(torch.tensor(trg_))
                all_pred_trg_ours['pred'].append(torch.tensor(pred_))

                a_ = tokenize_alphabets.convert_idx2str(src_[:origin_len[i]])
                s_ = ''.join(w + ' ' for w in hypothesis)
                t_ = ''.join(w + ' ' for w in referecne)

                out_pred_res.append(s_)
                out_trg_res.append(t_)

    return all_pred_trg_ours
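The downstream plotting/statistics code is not part of these examples. As one illustration of what can be computed from the collected pairs, a sketch of word-level precision and recall at a fixed threshold, assuming a dict shaped like all_pred_trg_ours:

import torch

# Hypothetical collected pairs in the same format as all_pred_trg_ours.
all_pred_trg = {'pred': [torch.tensor([0.97, 0.20, 0.99])],
                'trg':  [torch.tensor([1.00, 0.00, 0.00])]}
threshold = 0.96

pred = torch.cat(all_pred_trg['pred']) > threshold
trg = torch.cat(all_pred_trg['trg']) > threshold

tp = (pred & trg).sum().item()
precision = tp / max(pred.sum().item(), 1)
recall = tp / max(trg.sum().item(), 1)
print(precision, recall)  # 0.5 1.0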
Example #5
def ROC3():
    filepath = '../checkpoint_exe'
    tloader = test_loaders_LSTM(datapath=filepath,
                                config=config,
                                tokenize_alphabets=tokenize_alphabets_lstm,
                                num_workers=4,
                                batch_size=batch_size)

    data_np, data_pd = tloader.get_data_np()

    D2S_test = alphabert_dataset.D2Lntuh(
        data_np,
        tokenize_alphabets_lstm,
        clamp_size=config['max_position_embeddings'],
        train=False)

    D2S_testloader = DataLoader(D2S_test,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=alphabert_dataset.collate_fn_lstm)

    out_pred_res = []
    # device = 'cpu'
    modelpath = os.path.join(filepath, 'W_lstm_pretrain.pth')
    DS_model = torch.load(modelpath)
    DS_model.to(device)
    DS_model.eval()
    threshold = 0.33

    dloader = D2S_testloader

    out_pred_res = []
    out_trg_res = []

    all_pred_trg_lstm = {'pred': [], 'trg': []}

    rouge_set = []

    with torch.no_grad():
        for batch_idx, sample in enumerate(dloader):
            print(batch_idx)
            src = sample['src_token']
            trg = sample['trg']
            att_mask = sample['mask_padding']
            origin_len = sample['origin_seq_length']

            src = src.float().to(device)
            trg = trg.float().to(device)
            att_mask = att_mask.float().to(device)
            origin_len = origin_len.to(device)

            pred_prop = DS_model(x=src, x_lengths=origin_len)
            pred_prop = clean_up(
                src,
                pred_prop,
                mean_max='max',
                tokenize_alphabets=tokenize_alphabets_lstm)

            for i, src_ in enumerate(src):
                # all_pred_trg_lstm['pred'].append(pred_prop[i][:origin_len[i]].cpu())
                # all_pred_trg_lstm['trg'].append(trg[i][:origin_len[i]].cpu())

                src_split, src_isword = split2words(
                    src_,
                    tokenize_alphabets=tokenize_alphabets_lstm,
                    rouge=True)
                referecne = []
                hypothesis = []
                trg_ = []
                pred_ = []

                for j in range(len(src_split)):
                    if src_isword[j] > 0:
                        if trg[i][src_split[j]][0].cpu() > threshold:
                            referecne.append(
                                tokenize_alphabets_lstm.convert_idx2str(
                                    src_[src_split[j]]))
                        if pred_prop[i][src_split[j]][0].cpu() > threshold:
                            hypothesis.append(
                                tokenize_alphabets_lstm.convert_idx2str(
                                    src_[src_split[j]]))

                        trg_.append(trg[i][src_split[j]][0].cpu())
                        pred_.append(pred_prop[i][src_split[j]][0].cpu())

                rouge_set.append((hypothesis, referecne))

                all_pred_trg_lstm['trg'].append(torch.tensor(trg_))
                all_pred_trg_lstm['pred'].append(torch.tensor(pred_))

    return all_pred_trg_lstm
Example #6
def test_alphaBert(DS_model,
                   dloader,
                   threshold=0.5,
                   is_clean_up=True,
                   ep=0,
                   train=False,
                   mean_max='mean',
                   rouge=False):
    if not train:
        DS_model.to(device)
        DS_model = torch.nn.DataParallel(DS_model)
    DS_model.eval()

    out_pred_res = []

    all_pred_trg = {'pred': [], 'trg': []}
    rouge_set = []

    with torch.no_grad():
        for batch_idx, sample in enumerate(dloader):
            src = sample['src_token']
            trg = sample['trg']
            att_mask = sample['mask_padding']
            origin_len = sample['origin_seq_length']

            src = src.float().to(device)
            trg = trg.float().to(device)
            att_mask = att_mask.float().to(device)
            origin_len = origin_len.to(device)

            bs = src.shape

            prediction_scores, (pooled_output, head_outputs) = DS_model(
                input_ids=src, attention_mask=att_mask)
            pred_prop_bin = pooled_output[0].view(*bs, -1)

            if is_clean_up:
                pred_prop = clean_up_v204_ft(
                    src,
                    pred_prop_bin,
                    tokenize_alphabets=tokenize_alphabets,
                    mean_max=mean_max)
            else:
                pred_prop_value, pred_prop = pred_prop_bin.max(dim=2)
                pred_prop = pred_prop.float()

            pred_selected = pred_prop > threshold
            trg_selected = trg > threshold

            for i, src_ in enumerate(src):
                if rouge:
                    src_split, src_isword = split2words(
                        src_,
                        tokenize_alphabets=tokenize_alphabets,
                        rouge=rouge)
                    referecne = []
                    hypothesis = []
                    trg_ = []
                    pred_ = []

                    for j in range(len(src_split)):
                        if src_isword[j] > 0:
                            if trg[i][src_split[j]][0].cpu() > threshold:
                                referecne.append(
                                    tokenize_alphabets.convert_idx2str(
                                        src_[src_split[j]]))
                            if pred_prop[i][src_split[j]][0].cpu() > threshold:
                                hypothesis.append(
                                    tokenize_alphabets.convert_idx2str(
                                        src_[src_split[j]]))

                            trg_.append(trg[i][src_split[j]][0].cpu())
                            pred_.append(pred_prop[i][src_split[j]][0].cpu())

                    rouge_set.append((hypothesis, referecne))

                    all_pred_trg['trg'].append(torch.tensor(trg_))
                    all_pred_trg['pred'].append(torch.tensor(pred_))

                    a_ = tokenize_alphabets.convert_idx2str(
                        src_[:origin_len[i]])
                    s_ = ''.join(w + ' ' for w in hypothesis)
                    t_ = ''.join(w + ' ' for w in referecne)
                else:
                    all_pred_trg['pred'].append(
                        pred_prop[i][:origin_len[i]].cpu())
                    all_pred_trg['trg'].append(trg[i][:origin_len[i]].cpu())

                    a_ = tokenize_alphabets.convert_idx2str(
                        src_[:origin_len[i]])
                    s_ = tokenize_alphabets.convert_idx2str(
                        src_[pred_selected[i]])
                    t_ = tokenize_alphabets.convert_idx2str(
                        src_[trg_selected[i]])

                out_pred_res.append((a_, s_, t_, pred_prop[i]))
            print(batch_idx, len(dloader))
    out_pd_res = pd.DataFrame(out_pred_res)
    out_pd_res.to_csv('./iou_pic/test_pred.csv', sep=',')

    make_statistics(all_pred_trg, ep=ep)

    DS_model.train()

    if rouge:
        rouge_res = rouge12l(rouge_set)
        rouge_res_pd = pd.DataFrame(rouge_res)
        rouge_res_pd.to_csv('./iou_pic/rouge_res.csv', index=False)
        rouge_res_np = np.array(rouge_res_pd)

        pd.DataFrame(rouge_res_np.mean(axis=0)).to_csv(
            './iou_pic/rouge_res_mean.csv', index=False)
Example #7
        pooled_output, = DS_model(input_ids=src,
                                  attention_mask=att_mask,
                                  out='finehead')
        pred_prop_bin = pooled_output.view(*bs, -1)

        pred_prop = clean_up_v204_ft(src,
                                     pred_prop_bin,
                                     tokenize_alphabets=tokenize_alphabets,
                                     mean_max=mean_max)

        pred_selected = pred_prop > threshold
        trg_selected = trg > threshold

        for i, src_ in enumerate(src):
            src_split, src_isword = split2words(
                src_, tokenize_alphabets=tokenize_alphabets, rouge=True)
            referecne = []
            hypothesis = []
            trg_ = []
            pred_ = []

            for j in range(len(src_split)):
                if src_isword[j] > 0:
                    if trg[i][src_split[j]][0].cpu() > threshold:
                        referecne.append(
                            tokenize_alphabets.convert_idx2str(
                                src_[src_split[j]]))
                    if pred_prop[i][src_split[j]][0].cpu() > threshold:
                        hypothesis.append(
                            tokenize_alphabets.convert_idx2str(
                                src_[src_split[j]]))