Example #1
0
 def iwslt15Test(start="<START>", end="<END>", unknown="<UNKNOWN>"):
     """Load the IWSLT'15 ``tst2012`` test split and map it through the vocabularies.

     The vocabularies are built with ``VocabBuilder.iwslt15`` using the given
     special tokens. The source sentences are read from
     ``datasets/iwslt15/tst2012.en`` and the target sentences from
     ``datasets/iwslt15/tst2012.vi`` (paths are relative to the current
     working directory); each line is split on whitespace into tokens.

     Args:
         start: Start-of-sequence token used to build and query the vocabularies.
         end: End-of-sequence token used to build and query the vocabularies.
         unknown: Out-of-vocabulary token used to build the vocabularies and
             forwarded to ``Tokens2Indices.map``.

     Returns:
         A 3-tuple of:
           * the result of ``Tokens2Indices(...).map(start=None, end=None,
             unknown=unknown)`` (presumably the index-encoded sentence
             pairs -- confirm against ``Tokens2Indices``);
           * ``(sourceVocab[start], sourceVocab[end], sourceVocab[unknown],
             len(sourceVocab))``;
           * the same four values for ``targetVocab``.
     """
     import io

     from VocabBuilder import VocabBuilder

     def readTokenizedLines(path):
         # One whitespace-split token list per line of the file.
         with io.open(path, mode="r", encoding="utf-8") as f:
             return [line.split() for line in f]

     sourceVocab, targetVocab = VocabBuilder.iwslt15(start=start,
                                                     end=end,
                                                     unknown=unknown)
     source = readTokenizedLines("datasets/iwslt15/tst2012.en")
     target = readTokenizedLines("datasets/iwslt15/tst2012.vi")

     # start/end are passed as None exactly as before. The unknown token is
     # forwarded from the caller so it matches the token the vocabularies
     # were built with (it used to be hard-coded to "<UNKNOWN>").
     mapped = Tokens2Indices(sourceVocab=sourceVocab,
                             targetVocab=targetVocab,
                             source=source,
                             target=target).map(start=None,
                                                end=None,
                                                unknown=unknown)
     sourceInfo = (sourceVocab[start], sourceVocab[end], sourceVocab[unknown],
                   len(sourceVocab))
     targetInfo = (targetVocab[start], targetVocab[end], targetVocab[unknown],
                   len(targetVocab))
     return (mapped, sourceInfo, targetInfo)