def iwslt15Test(start="<START>", end="<END>", unknown="<UNKNOWN>"):
    """Load the IWSLT15 tst2012 source/target files and map them to indices.

    Reads ``datasets/iwslt15/tst2012.en`` (source) and
    ``datasets/iwslt15/tst2012.vi`` (target) as UTF-8 text, splits every
    line on whitespace, and hands the token lists together with the
    vocabularies from ``VocabBuilder.iwslt15`` to ``Tokens2Indices``.

    Args:
        start: Start-of-sequence marker used to build and query the vocabularies.
        end: End-of-sequence marker used to build and query the vocabularies.
        unknown: Unknown-token marker used to build and query the
            vocabularies and forwarded to ``Tokens2Indices.map``.

    Returns:
        A 3-tuple ``(mapped, sourceInfo, targetInfo)`` where ``mapped`` is
        whatever ``Tokens2Indices(...).map(...)`` returns, and each ``*Info``
        is ``(vocab[start], vocab[end], vocab[unknown], len(vocab))`` for the
        corresponding vocabulary.
    """
    # Function-scope imports are kept so this block does not depend on what
    # the rest of the module imports.
    import io

    from VocabBuilder import VocabBuilder

    def readTokenized(path):
        # One whitespace-split token list per line of the file.
        with io.open(path, mode="r", encoding="utf-8") as f:
            return [line.split() for line in f]

    sourceVocab, targetVocab = VocabBuilder.iwslt15(
        start=start, end=end, unknown=unknown
    )

    source = readTokenized("datasets/iwslt15/tst2012.en")
    target = readTokenized("datasets/iwslt15/tst2012.vi")

    # start/end stay None exactly as before. The unknown marker now follows
    # the caller's argument instead of the literal "<UNKNOWN>", so it always
    # matches the marker the vocabularies were built with.
    # NOTE(review): presumably map() uses it to resolve out-of-vocabulary
    # tokens - confirm against Tokens2Indices.
    mapped = Tokens2Indices(
        sourceVocab=sourceVocab,
        targetVocab=targetVocab,
        source=source,
        target=target,
    ).map(start=None, end=None, unknown=unknown)

    sourceInfo = (
        sourceVocab[start],
        sourceVocab[end],
        sourceVocab[unknown],
        len(sourceVocab),
    )
    targetInfo = (
        targetVocab[start],
        targetVocab[end],
        targetVocab[unknown],
        len(targetVocab),
    )
    return (mapped, sourceInfo, targetInfo)