Beispiel #1
0
    def single_decoding(self, data, model):
        """Classify ``data`` one line at a time.

        Calls ``self.onDecodingStart()`` once, then for every line of
        ``data`` builds a single-item batch, runs ``model.classify`` on it
        with ``self._decoding_options`` and reports the result through
        ``self.onSingleUpdate(index, inp, out)``.

        Args:
            data: iterable of text lines; each line is stripped before loading.
            model: object exposing ``classify(inp, **options)``.
        """
        self.onDecodingStart()
        for index, raw_line in enumerate(data):
            # Wrap the stripped line in a one-element list so the loader
            # sees a corpus of exactly one entry.
            loaded = self.loader([raw_line.strip()], self._inp_vocab)
            batches = list(batch_generator(loaded, (self._inp_vocab,), 1))
            # First batch, first component of that batch.
            # NOTE(review): presumably the encoded input — confirm against
            # batch_generator.
            inp = batches[0][0]
            out = model.classify(inp, **self._decoding_options)

            self.onSingleUpdate(index, inp, out)
Beispiel #2
0
    def batched_decoding(self, data, model):
        """Classify the contents of the file ``data`` batch by batch.

        The file is read through ``self.loader``, split into batches of
        ``self._batch`` by ``batch_generator`` and each batch is passed to
        ``model.classify``. ``self.onBatchUpdate`` is called after every
        batch and ``self.onDecodingFinish`` once at the end.

        Args:
            data: path of the input file to open.
            model: object exposing ``classify(src, **options)``.
        """
        # Load data
        with open(data) as inp_fp:
            loaded = self.loader(inp_fp, self._inp_vocab)

        self.onDecodingStart()

        # Start decoding
        batches = batch_generator(loaded, (self._inp_vocab,), batch_size=self._batch)
        output = {}
        processed = 0
        for src, src_id in batches:
            trg = model.classify(src, **self._decoding_options)

            # Collect output: id -> (input item, classified item)
            output.update(
                (id_i, (src_i, trg_i))
                for src_i, trg_i, id_i in zip(src, trg, src_id)
            )

            # The counter handed to the callback is the number of items
            # processed before this batch.
            self.onBatchUpdate(processed, src, trg)
            processed += len(src)

        # The finish callback receives the batch iterable that the loop above
        # has already iterated over, not the file path.
        self.onDecodingFinish(batches, output)
Beispiel #3
0
# --use_cpu overrides the GPU id with -1; that value is what the trainer
# receives below.
if args.use_cpu:
    args.gpu = -1

# --save_models forces save_len to 1.
if args.save_models:
    args.save_len = 1

""" Training """
trainer = ParallelTrainer(args.seed, args.gpu)

# data: read both sides of the training corpus, then pre-batch it.
UF.trace("Loading corpus + dictionary")
with open(args.src) as src_fp, open(args.trg) as trg_fp:
    SRC, TRG, train_data = load_nmt_train_data(
        src_fp,
        trg_fp,
        cut_threshold=args.unk_cut,
    )
    train_data = list(batch_generator(train_data, (SRC, TRG), args.batch))
UF.trace("SRC size:", len(SRC))
UF.trace("TRG size:", len(TRG))
UF.trace("Data loaded.")

# dev data: optional, only loaded when both sides are given. SRC and TRG
# from the training corpus are passed to the loader.
dev_data = None
if args.src_dev and args.trg_dev:
    with open(args.src_dev) as src_fp, open(args.trg_dev) as trg_fp:
        UF.trace("Loading dev data")
        _, _, dev_data = load_nmt_train_data(src_fp, trg_fp, SRC, TRG)
        dev_data = list(batch_generator(dev_data, (SRC, TRG), args.batch))
        UF.trace("Dev data loaded")

""" Setup model """
Beispiel #4
0
parser.add_argument(
    "--model",
    type=str,
    choices=["lstm"],
    default="lstm",
    help="Type of model being trained.",
)
parser.add_argument(
    "--unk_cut",
    type=int,
    default=1,
    help="Threshold for words in corpora to be treated as unknown.",
)
parser.add_argument(
    "--dropout",
    type=positive_decimal,
    default=0.2,
    help="Dropout ratio for LSTM.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=0,
    help="Seed for RNG. 0 for totally random seed.",
)
args = parser.parse_args()

# --use_cpu overrides the GPU id with -1; that value is passed to the
# trainer and the classifier below.
if args.use_cpu:
    args.gpu = -1

""" Training """
trainer = ParallelTrainer(args.seed, args.gpu)

# data: the training corpus is read from standard input, then pre-batched.
UF.trace("Loading corpus + dictionary")
X, Y, data = load_pos_train_data(sys.stdin, cut_threshold=args.unk_cut)
data = list(batch_generator(data, (X, Y), args.batch))
UF.trace("INPUT size:", len(X))
UF.trace("LABEL size:", len(Y))
UF.trace("Data loaded.")

""" Setup model """
UF.trace("Setting up classifier")
opt = optimizers.Adam()
model = ParallelTextClassifier(
    args,
    X,
    Y,
    opt,
    args.gpu,
    activation=F.relu,
    collect_output=args.verbose,
)

""" Training Callback """
def onEpochStart(epoch):
    """Trace the start of an epoch; the number logged is ``epoch + 1``."""
    epoch_number = epoch + 1
    UF.trace("Starting Epoch", epoch_number)

def report(output, src, trg, trained, epoch):
    for index in range(len(src)):
Beispiel #5
0
 def setUp(self):
     """Prepare a two-sentence parallel corpus, a model and its batches.

     Stores an ``Attentional`` model in ``self.model`` and the result of
     ``batch_generator`` with batch size 1 in ``self.data``.
     """
     source_sentences = ["I am Philip .", "I am a student ."]
     target_sentences = ["私 は フィリップ です .", "私 は 学生 です ."]
     # cut_threshold=0 is passed explicitly for this tiny corpus.
     src_vocab, trg_vocab, corpus = load_nmt_train_data(
         source_sentences, target_sentences, cut_threshold=0
     )
     self.model = Attentional(src_vocab, trg_vocab, Args(src_vocab, trg_vocab))
     self.data = batch_generator(corpus, (src_vocab, trg_vocab), 1)
Beispiel #6
0
parser.add_argument(
    "--unk_cut",
    type=int,
    default=1,
    help="Threshold for words in corpora to be treated as unknown.",
)
parser.add_argument(
    "--dropout",
    type=positive_decimal,
    default=0.2,
    help="Dropout ratio for LSTM.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=0,
    help="Seed for RNG. 0 for totally random seed.",
)
parser.add_argument(
    "--dev",
    type=str,
    help="Development data.",
)
args = parser.parse_args()

# --use_cpu overrides the GPU id with -1; that value is passed to the
# trainer below.
if args.use_cpu:
    args.gpu = -1

""" Training """
trainer = ParallelTrainer(args.seed, args.gpu)

# data: the training corpus is read from standard input. The same X is
# passed twice to batch_generator.
UF.trace("Loading corpus + dictionary")
X, data = load_lm_data(sys.stdin, cut_threshold=args.unk_cut)
data = list(batch_generator(data, (X, X), args.batch))
UF.trace("INPUT size:", len(X))
UF.trace("Data loaded.")

# dev data: optional; X from the training corpus is passed to the loader.
dev_data = None
if args.dev:
    with open(args.dev) as dev_fp:
        UF.trace("Loading dev data")
        _, dev_data = load_lm_data(dev_fp, X)
        dev_data = list(batch_generator(dev_data, (X, X), args.batch))
        UF.trace("Dev data loaded")

""" Setup model """
UF.trace("Setting up classifier")
opt = optimizers.Adam()
Beispiel #7
0
        pass

def onSingleUpdate(ctr, src, trg):
    """Print the result of one decoding step according to the global ``op``.

    For ``"gen"`` the first element of ``trg`` is rendered with
    ``VOC.str_rpr``; for ``"sppl"`` ``PPL(trg)`` is printed. Any other
    operation prints nothing. ``ctr`` and ``src`` are unused.
    """
    if op == "gen":
        print(VOC.str_rpr(trg[0]))
        return
    if op == "sppl":
        print(PPL(trg))

def onDecodingFinish(data, output):
    """Print the final result of a decoding run according to the global ``op``.

    For ``"gen"`` every entry of ``output`` (a mapping of id to an
    ``(input, output)`` pair) is printed in ascending id order. For
    ``"cppl"`` the value of ``PPL(output)`` is traced and printed. Any other
    operation prints nothing. ``data`` is unused.
    """
    if op == "gen":
        # Render with VOC, the vocabulary used everywhere else in this
        # script (onSingleUpdate, the Tester construction); the previous
        # reference to TRG does not appear anywhere else here.
        for _, (_, out) in sorted(output.items(), key=lambda item: item[0]):
            print(VOC.str_rpr(out))
    elif op == "cppl":
        # Compute the value once and reuse it for both the trace and stdout.
        corpus_ppl = PPL(output)
        UF.trace("Corpus PPL:", corpus_ppl)
        print(corpus_ppl)

tester = Tester(
    load_lm_gen_data,
    VOC,
    onDecodingStart,
    onBatchUpdate,
    onSingleUpdate,
    onDecodingFinish,
    batch=args.batch,
    out_vocab=VOC,
    options=decoding_options,
)
if op in ("sppl", "cppl"):
    # Perplexity modes: load the corpus from --src when given, otherwise
    # from standard input, then evaluate it in batches.
    if args.src:
        with open(args.src) as src_fp:
            _, data = load_lm_data(src_fp, VOC)
    else:
        _, data = load_lm_data(sys.stdin, VOC)
    data = list(batch_generator(data, (VOC, VOC), args.batch))
    tester.eval(data, model)
elif op == "gen":
    # Generation mode: the tester is given args.src directly.
    tester.test(args.src, model)
else:
    raise NotImplementedError("Undefined operation:", op)