def test(args, pt, step):
    """Run abstractive-summarization inference with a trained checkpoint.

    Loads the checkpoint named by `pt` (or `args.test_from` when `pt` is
    empty), restores the model-defining flags stored inside it, builds the
    test dataloader for `args.dataset`, and translates it at `step`.
    """
    device = "cuda" if args.visible_gpus != '-1' else "cpu"

    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    # map_location keeps tensors on CPU regardless of where they were saved.
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

    # Override CLI args with the model flags recorded in the checkpoint so
    # the rebuilt model matches the trained one.
    opt = vars(checkpoint['opt'])
    for key, value in opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(args.vocab_path)
    word_padding_idx = spm.PieceToId('<PAD>')
    symbols = {
        'BOS': spm.PieceToId('<S>'),
        'EOS': spm.PieceToId('</S>'),
        'PAD': word_padding_idx,
        'EOT': spm.PieceToId('<T>'),
        'EOP': spm.PieceToId('<P>'),
        'EOQ': spm.PieceToId('<Q>'),
    }

    vocab_size = len(spm)
    model = Summarizer(args, word_padding_idx, vocab_size, device, checkpoint)
    model.eval()

    test_iter = data_loader.AbstractiveDataloader(
        args, load_dataset(args, args.dataset, shuffle=False), symbols,
        args.valid_batch_size, device, shuffle=False, is_test=True)
    predictor = build_predictor(args, spm, symbols, model, logger=logger)
    predictor.translate(test_iter, step)
def validate(args, device_id, pt, step):
    """Evaluate a checkpoint on the validation split and return its perplexity."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"

    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    # map_location keeps tensors on CPU regardless of where they were saved.
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

    # Restore the model-defining flags stored in the checkpoint so the
    # rebuilt model matches the trained one.
    opt = vars(checkpoint['opt'])
    for key, value in opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(args.vocab_path)
    word_padding_idx = spm.PieceToId('<PAD>')
    symbols = {
        'BOS': spm.PieceToId('<S>'),
        'EOS': spm.PieceToId('</S>'),
        'PAD': word_padding_idx,
        'EOT': spm.PieceToId('<T>'),
        'EOP': spm.PieceToId('<P>'),
        'EOQ': spm.PieceToId('<Q>'),
    }

    vocab_size = len(spm)
    model = Summarizer(args, word_padding_idx, vocab_size, device, checkpoint)
    model.eval()

    valid_iter = data_loader.AbstractiveDataloader(
        args, load_dataset(args, 'valid', shuffle=False), symbols,
        args.batch_size, device, shuffle=False, is_test=False)

    trainer = build_trainer(args, device_id, model, symbols, vocab_size, None)
    stats = trainer.validate(valid_iter)
    trainer._report_step(0, step, valid_stats=stats)
    return stats.ppl()
 def train_iter_fct():
     # Build a fresh shuffled training dataloader on each call.
     # NOTE(review): this nested function closes over `args`, `symbols` and
     # `device` from an enclosing scope that is not visible in this chunk --
     # confirm against the full file.
     return data_loader.AbstractiveDataloader(args,
                                              load_dataset(args,
                                                           'train',
                                                           shuffle=True),
                                              symbols,
                                              args.batch_size,
                                              device,
                                              shuffle=True,
                                              is_test=False)
 def valid_iter_fct():
     # Build a validation dataloader on each call.
     # NOTE(review): `shuffle=True` and `is_test=True` are unusual for a
     # validation loader (the train counterpart uses is_test=False) --
     # verify this is intentional.
     return data_loader.AbstractiveDataloader(args,
                                              load_dataset(args,
                                                           'valid',
                                                           shuffle=False),
                                              symbols,
                                              args.valid_batch_size,
                                              device,
                                              shuffle=True,
                                              is_test=True)
# Beispiel #5
# 0
def stats():
    """Log average source/target lengths for the selected dataset split(s)."""
    device = "cpu"

    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(FLAGS.vocab_path)
    word_padding_idx = spm.PieceToId('<PAD>')
    symbols = {
        'BOS': spm.PieceToId('<S>'),
        'EOS': spm.PieceToId('</S>'),
        'PAD': word_padding_idx,
        'EOT': spm.PieceToId('<T>'),
        'EOP': spm.PieceToId('<P>'),
        'EOQ': spm.PieceToId('<Q>')
    }

    # Empty FLAGS.dataset means "all splits".
    datasets = ['train', 'valid', 'test'] if FLAGS.dataset == '' else [FLAGS.dataset]

    src_total, tgt_total = 0, 0
    count = 0
    for split in datasets:
        loader = data_loader.AbstractiveDataloader(load_dataset(split,
                                                                shuffle=False),
                                                   symbols,
                                                   FLAGS.valid_batch_size,
                                                   device,
                                                   shuffle=False,
                                                   is_test=True)

        for batch in loader:
            src_total += sum(int(length) for length in batch.src_length)
            # Count non-padding target tokens; token id 6 is presumably the
            # padding id -- TODO confirm against the vocabulary.
            tgt_total += batch.tgt.size(0) * batch.tgt.size(1) - int(
                torch.sum(batch.tgt == 6))
            count += int(batch.tgt.size(1))

        # NOTE(review): the totals accumulate across splits, so these averages
        # are cumulative rather than per-split (matches original behaviour).
        logger.info('n %d' % count)
        logger.info('src_l %f' % (1.0 * src_total / count))
        logger.info('tgt_l %f' % (1.0 * tgt_total / count))
def test(args, pt, step):
    """Run inference on `args.dataset` with a trained checkpoint.

    NOTE(review): duplicate definition -- another `test(args, pt, step)`
    appears earlier in this file; the hard-coded `data` article list below is
    encoded but never fed to the dataloader, so it looks like leftover
    experimentation (see the commented-out `load_dataset_live` path).
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    # map_location keeps tensors on CPU regardless of where they were saved.
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])

    # Override CLI args with the model flags stored in the checkpoint.
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(args.vocab_path)
    word_padding_idx = spm.PieceToId('<PAD>')
    symbols = {
        'BOS': spm.PieceToId('<S>'),
        'EOS': spm.PieceToId('</S>'),
        'PAD': word_padding_idx,
        'EOT': spm.PieceToId('<T>'),
        'EOP': spm.PieceToId('<P>'),
        'EOQ': spm.PieceToId('<Q>')
    }

    vocab_size = len(spm)
    vocab = spm
    model = Summarizer(args, word_padding_idx, vocab_size, device, checkpoint)
    model.eval()

    # Hard-coded sample articles (news snippets) for ad-hoc live testing.
    data = [
        "President-elect Joe Biden criticized the Trump administration Tuesday for the pace of distributing COVID-19 vaccines and predicted that “things will get worse before they get better” when it comes to the pandemic. Biden encouraged Americans to “steel our spines” for challenges to come and predicted that “things are going to get worse before they get better.” Earlier this month, Trump administration officials said they planned to have 20 million doses of the vaccine distributed by the end of the year. At the current pace, Biden said, “it’s gonna take years, not months, to vaccinate the American people.",
        "The Second Stimulus Answers to Your Questions About the Stimulus Bill Updated Dec 30, 2020 The economic relief package will issue payments of $600 and distribute a federal unemployment benefit of $300 for at least 10 weeks. To receive assistance, households will have to meet several conditions: Household income (for 2020) cannot exceed more than 80 percent of the area median income; at least one household member must be at risk of homelessness or housing instability; and individuals must qualify for unemployment benefits or have experienced financial hardship — directly or indirectly — because of the pandemic. Robert Van Sant’s unemployment benefits of $484 a week don’t cover his monthly expenses of $2,200 in rent, utilities, internet access, food and other necessities.",
        "The majority leader, who effectively ruled out consideration of the House version stimulus check bill, has sought to couple the stimulus boost with unrelated provisions aimed at placating Trump’s demands to address legal protections for tech companies and his unsubstantiated claims of voter fraud in the 2020 election. McConnell said the House bill on the stimulus checks “has no realistic path to quickly pass the Senate,” even as Trump continues to harangue McConnell and GOP leaders over their refusal to go along with his demands. Sen. John Cornyn (R-Texas) predicted the Senate would follow the House and override Trump’s veto of the defense bill, calling it a matter of “how long people want to extend this out.",
        "The Second Stimulus Answers to Your Questions About the Stimulus Bill Updated Dec 30, 2020 The economic relief package will issue payments of $600 and distribute a federal unemployment benefit of $300 for at least 10 weeks. To receive assistance, households will have to meet several conditions: Household income (for 2020) cannot exceed more than 80 percent of the area median income; at least one household member must be at risk of homelessness or housing instability; and individuals must qualify for unemployment benefits or have experienced financial hardship — directly or indirectly — because of the pandemic. Robert Van Sant’s unemployment benefits of $484 a week don’t cover his monthly expenses of $2,200 in rent, utilities, internet access, food and other necessities.",
        "House Armed Services Chair Adam Smith (D-Wash.) told reporters last week that lawmakers’ “only option” if an override fails will be to attempt to pass the same defense agreement after President-elect Joe Biden takes office. Trump has also pushed for lawmakers to use the defense bill to repeal legal protections for social media companies, known as Section 230, but was rebuffed as Republicans and Democrats alike said it fell outside of the Armed Services Committee’s jurisdiction. Both the House and Senate passed a compromise defense bill last week with more than enough votes to overcome a veto, including strong support from Republicans.",
        "Top officials, including Speaker Nancy Pelosi and Mnuchin, are privately discussing contingency plans such as a stopgap spending bill if Trump does formally veto the measure by Monday, when funding is set to lapse. But it’s not clear how long that stopgap measure would last — or whether Trump would sign it, if it makes none of the changes he’s demanded, such as cuts to foreign aid, according to people familiar with the discussions. “Today, on Christmas Eve morning, House Republicans cruelly deprived the American people of the $2,000 that the President agreed to support,” Pelosi said in a statement.",
        "McConnell has also expressed concern that the vote could hurt GOP senators facing tough general election fights by alienating moderate voters. According to multiple people familiar with the discussion, the Senate GOP leader also asked Hawley several times to walk through how his objection would play out. The Missouri senator has focused his objections on Pennsylvania, arguing that it and other states failed to adhere to their own election laws. A Toomey spokesperson confirmed the account, saying: “Sen. Toomey made his views on Senator Hawley’s planned objection clear. Hawley instead sent an email to Senate Republicans after the call wrapped.",
        "President Trump spurned Democrats and Republicans alike on Wednesday as he left Washington defiantly for the holidays, vetoing a major defense bill, imperiling a COVID-19 relief package and setting the stage for a possible government shutdown after Christmas. Mr. Trump vetoed the $740 billion National Defense Authorization Act as promised, citing among his objections that lawmakers didn’t honor his late demand to repeal legal liability protections for big social media companies such as Twitter, Google and Facebook. Senate Armed Services Committee Chairman James Inhofe, Oklahoma Republican, said the president’s valid concerns about Big Tech companies shouldn’t derail the annual defense bill that sets national security priorities."
    ]

    dataset_list = []
    mds = []

    # Encode the articles with sentencepiece; `mds` is built but unused below.
    for article in data:
        encoded = spm.Encode(article)
        mds.append(encoded)

    dataset_list = [{"src": mds, "tgt": [], "tgt_str": []}]
    # df = pd.DataFrame(dataset_list)
    # dataset = MultiNewsDataset(df)
    # dataloader = DataLoader(dataset, batch_size=1)

    # load_dataset(args, args.dataset, shuffle=False)
    # test_iter = data_loader.AbstractiveDataloader(args, load_dataset_live(dataset_list), symbols,
    #                                               args.valid_batch_size, device, shuffle=False, is_test=True)

    test_iter = data_loader.AbstractiveDataloader(args,
                                                  load_dataset(args,
                                                               args.dataset,
                                                               shuffle=False),
                                                  symbols,
                                                  args.valid_batch_size,
                                                  device,
                                                  shuffle=False,
                                                  is_test=True)
    predictor = build_predictor(args, vocab, symbols, model, logger=logger)
    # Unlike the earlier test(), translate() here also receives the
    # sentencepiece processor.
    predictor.translate(test_iter, step, spm)
# Beispiel #7
# 0
def baseline():
    """Write lead-N extractive baselines and gold summaries for each split.

    For every selected dataset split this writes, under FLAGS.result_path:
      * baseline_output_<split>.lead          -- lead text truncated to gold length
      * baseline_output_<split>.gold          -- reference summaries
      * baseline_output_para_<k>_<split>.lead -- first k source paragraphs,
                                                 for k in {5, 10, 20, 40}

    Fixes over the original: output files are managed by an ExitStack so they
    are closed even when an exception is raised mid-split (the original only
    closed them on the happy path, via a side-effect list comprehension);
    removed the builtin-shadowing `iter` name and the unused `nnn` counter.
    """
    from contextlib import ExitStack

    device = "cpu"

    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(FLAGS.vocab_path)
    word_padding_idx = spm.PieceToId('<PAD>')
    symbols = {
        'BOS': spm.PieceToId('<S>'),
        'EOS': spm.PieceToId('</S>'),
        'PAD': word_padding_idx,
        'EOT': spm.PieceToId('<T>'),
        'EOP': spm.PieceToId('<P>'),
        'EOQ': spm.PieceToId('<Q>')
    }

    # Empty FLAGS.dataset means "all splits".
    if (FLAGS.dataset == ''):
        dataset = ['train', 'valid', 'test']
    else:
        dataset = [FLAGS.dataset]

    len_lst = [5, 10, 20, 40]
    for d in dataset:
        loader = data_loader.AbstractiveDataloader(load_dataset(d,
                                                                shuffle=False),
                                                   symbols,
                                                   FLAGS.valid_batch_size,
                                                   device,
                                                   shuffle=False,
                                                   is_test=True)

        # ExitStack guarantees every output file is closed on any exit path.
        with ExitStack() as stack:
            lead_save = stack.enter_context(open(
                os.path.join(FLAGS.result_path,
                             'baseline_output_' + d + '.lead'), 'w'))
            gold_save = stack.enter_context(open(
                os.path.join(FLAGS.result_path,
                             'baseline_output_' + d + '.gold'), 'w'))
            lead_para_save = [
                stack.enter_context(open(
                    os.path.join(FLAGS.result_path,
                                 'baseline_output_para_%d_' % i + d + '.lead'),
                    'w')) for i in len_lst
            ]

            for b in loader:
                for i in range(len(b)):
                    # Decode each source paragraph, keep only the text after
                    # the last <T> marker, and strip special tokens.
                    src = [
                        spm.DecodeIds([int(t) for t in b.src[i][j]])
                        .split('<T>')[-1]
                        .replace('<P>', '')
                        .replace('</S>', '')
                        .replace('<PAD>', '')
                        .strip()
                        for j in range(b.src.size(1))
                    ]
                    # Lead-k-paragraph baselines.
                    for j, l in enumerate(len_lst):
                        lead_para_save[j].write(' '.join(src[:l]) + '\n')
                    tgt = b.tgt_str[i].replace('<t>', '').replace('</t>',
                                                                  '').split()
                    # Lead baseline: truncate the full source to gold length.
                    lead = ' '.join(src).split()[:len(tgt)]
                    lead_save.write(' '.join(lead) + '\n')
                    gold_save.write(' '.join(tgt) + '\n')

            logger.info('Saved lead results to %s' %
                        os.path.realpath(lead_save.name))
            logger.info('Saved gold results to %s' %
                        os.path.realpath(gold_save.name))