Example #1
import h5py
import numpy

# ldvocab, batch_padder, h5datawargs and the max_* defaults below are assumed
# to be provided by the surrounding project's utility and configuration modules.
def handle(finput,
           fvocab_i,
           frs,
           minbsize=1,
           expand_for_mulgpu=True,
           bsize=max_sentences_gpu,
           maxpad=max_pad_tokens_sentence,
           maxpart=normal_tokens_vs_pad_tokens,
           maxtoken=max_tokens_gpu,
           minfreq=False,
           vsize=False):
    # Load the source vocabulary and its size.
    vcbi, nwordi = ldvocab(fvocab_i, minfreq, vsize)
    # When batching for multiple GPUs, scale the per-step limits accordingly.
    if expand_for_mulgpu:
        _bsize = bsize * minbsize
        _maxtoken = maxtoken * minbsize
    else:
        _bsize = bsize
        _maxtoken = maxtoken
    rsf = h5py.File(frs, 'w')
    src_grp = rsf.create_group("src")
    curd = 0
    # Write each padded batch of token ids as its own dataset under "src".
    for i_d in batch_padder(finput, vcbi, _bsize, maxpad, maxpart, _maxtoken,
                            minbsize):
        rid = numpy.array(i_d, dtype=numpy.int32)
        #rld = numpy.array(ld, dtype = numpy.int32)
        wid = str(curd)
        src_grp.create_dataset(wid, data=rid, **h5datawargs)
        #rsf["l" + wid] = rld
        curd += 1
    # Record the number of batches and the vocabulary size alongside the data.
    rsf["ndata"] = numpy.array([curd], dtype=numpy.int32)
    rsf["nword"] = numpy.array([nwordi], dtype=numpy.int32)
    rsf.close()
    print("Number of batches: %d\nSource Vocabulary Size: %d" % (curd, nwordi))
Example #2
import torch

# batch_padder is assumed to come from the same batching utilities as above.
def data_loader(sentences_iter,
                vcbi,
                minbsize=1,
                bsize=768,
                maxpad=16,
                maxpart=4,
                maxtoken=3920):
    # Yield each padded batch of token ids as a LongTensor.
    for i_d in batch_padder(sentences_iter, vcbi, bsize, maxpad, maxpart,
                            maxtoken, minbsize):
        yield torch.tensor(i_d, dtype=torch.long)
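A minimal usage sketch, not part of the original example: it assumes vcbi is a vocabulary mapping (e.g. loaded with ldvocab) and that sentences is whatever iterable batch_padder expects.

    # Hypothetical usage: vcbi and sentences come from the surrounding project.
    for batch in data_loader(sentences, vcbi, bsize=768, maxpad=16):
        print(batch.shape)  # one padded LongTensor batch of token ids per step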