Example #1
    def dataset_initialize(self):
        print("-- Preparing Data --")

        # Shared preprocessing: resize, convert to tensor, normalize to [-1, 1]
        self.transform = [
            transforms.Resize((256, 256), Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]

        self.train_data_loader = dataset(root_dir=self.args.DATA.data_path,
                                         dataset=self.args.DATA.dataset,
                                         mode='train',
                                         direction=self.args.DATA.direction,
                                         transform=self.transform)

        self.val_data_loader = dataset(root_dir=self.args.DATA.data_path,
                                       dataset=self.args.DATA.dataset,
                                       mode='val',
                                       direction=self.args.DATA.direction,
                                       transform=self.transform)

        # Wrap the datasets in batched DataLoaders
        self.train_loader = DataLoader(
            self.train_data_loader,
            batch_size=self.args.DATA.batch_size,
            shuffle=True,
            num_workers=self.args.SETTINGS.num_workers,
            drop_last=True)

        self.val_loader = DataLoader(
            self.val_data_loader,
            batch_size=self.args.DATA.batch_size,
            shuffle=True,
            num_workers=self.args.SETTINGS.num_workers,
            drop_last=True)

        print("-- Dataset DONE --")
Example #2
def make_data(text, title, authors, label):

    text_data = []
    for each_x in text:
        text_data.append(
            make_text_data1(each_x['text_file'], each_x['text_dict'],
                            each_x['doc_len'], each_x['text_len']))
    text_data.append(
        make_text_data2(title['text_file'], title['text_dict'],
                        title['text_len']))
    text_data.append(
        make_text_data2(authors['text_file'],
                        authors['text_dict'],
                        authors['text_len'],
                        sep=','))
    label_data = make_label_data(label['label_file'], label['label_dict'])

    return dataset(text_data, label_data)
Example #3
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    src, tgt = [], [] 
    raw_src, raw_tgt = [], [] 
    sizes = [] 
    count, ignored = 0, 0

    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile)
    tgtF = open(tgtFile)

    while True: 
        sline = srcF.readline()
        tline = tgtF.readline()

        # normal end of file
        if sline == "" and tline == "":
            break

        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print('WARNING: source and target do not have the same number of sentences')
            break

        sline = sline.strip()
        tline = tline.strip()

        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line ('+str(count+1)+')')
            ignored += 1
            continue

        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()

        srcWords = sline.split()
        tgtWords = tline.split()

        # keep only sentence pairs within the configured length limits
        if opt.src_length == 0 or (len(srcWords) <= opt.src_length and len(tgtWords) <= opt.tgt_length):

            if char:
                srcWords = [word + " " for word in srcWords]
                tgtWords = list(" ".join(tgtWords))
            else:
                srcWords = [word+" " for word in srcWords]
                tgtWords = [word+" " for word in tgtWords]

            src += [srcDicts.convertToIdx(srcWords,
                                          dict.UNK_WORD)] 
            tgt += [tgtDicts.convertToIdx(tgtWords,
                                          dict.UNK_WORD,
                                          dict.BOS_WORD,
                                          dict.EOS_WORD)]
            raw_src += [srcWords]
            raw_tgt += [tgtWords]
            sizes += [len(srcWords)] 
        else:
            ignored += 1

        count += 1

        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)

    srcF.close()
    tgtF.close()

    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]

    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]

    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))

    return dataset(src, tgt, raw_src, raw_tgt) 
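
A minimal sketch of the shuffle and sort-by-size index patterns used above, applied to toy parallel lists (all values hypothetical):

import torch

sizes = [5, 2, 9, 1]
src = ['a', 'b', 'c', 'd']

# shuffling: one random permutation, applied to every parallel list
perm = torch.randperm(len(src))
shuffled = [src[idx] for idx in perm]

# sorting by size: torch.sort returns (values, indices); the indices reorder the lists
_, perm = torch.sort(torch.Tensor(sizes))
sorted_by_size = [src[idx] for idx in perm]
print(shuffled, sorted_by_size)  # sorted_by_size == ['d', 'b', 'a', 'c']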
Example #4
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    '''
    Convert the words in the language files to index values using the dictionaries.
    Inputs:
        srcFile: str, file name of the source-language file
        tgtFile: str, file name of the target-language file
        srcDicts: str, the pre-built dictionary of the source language
        tgtDicts: str, the pre-built dictionary of the target language
        sort: bool, whether to sort
        char: bool, whether to operate at the character level
    Return:
        dataset: the packaged dataset, containing src, tgt, raw_src, raw_tgt
        src: list, each element is one sample's torch.LongTensor of index values.
        tgt: list, each element is one sample's torch.LongTensor of index values.
        raw_src: list, each element is itself a list of that sample's words.
        raw_tgt: list, each element is itself a list of that sample's words.
    '''

    src, tgt = [], []  # src, tgt are lists; each element is one line of indices
    raw_src, raw_tgt = [], []  # raw_src, raw_tgt are lists; each element is one line of words
    sizes = []  # each element is the number of words in that sample
    count, ignored = 0, 0

    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile)
    tgtF = open(tgtFile)

    while True:  # read one line at a time
        sline = srcF.readline()
        tline = tgtF.readline()

        # normal end of file
        if sline == "" and tline == "":
            break

        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print(
                'WARNING: source and target do not have the same number of sentences'
            )
            break

        sline = sline.strip()
        tline = tline.strip()

        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue

        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()

        srcWords = sline.split()
        tgtWords = tline.split()

        # keep only sentence pairs within the configured length limits
        if opt.src_length == 0 or (len(srcWords) <= opt.src_length
                                   and len(tgtWords) <= opt.tgt_length):

            if char:
                srcWords = [word + " " for word in srcWords]
                tgtWords = list(" ".join(tgtWords))
            else:
                srcWords = [word + " " for word in srcWords[:300]]
                tgtWords = [word + " " for word in tgtWords]

            src += [srcDicts.convertToIdx(srcWords,
                                          dict.UNK_WORD)]  # src does not need BOS or EOS added
            tgt += [
                tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD, dict.BOS_WORD,
                                      dict.EOS_WORD)
            ]
            raw_src += [srcWords]
            raw_tgt += [tgtWords]
            sizes += [len(srcWords)]  # sizes holds the number of words in each src line
        else:
            ignored += 1

        count += 1

        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)

    srcF.close()
    tgtF.close()

    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]

    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]

    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))

    return dataset(src, tgt, raw_src, raw_tgt)  # wrap everything into the dataset
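
The Dict class behind convertToIdx is not shown in these examples; the sketch below only illustrates its assumed behaviour (word-to-id lookup with an UNK fallback, plus optional BOS/EOS wrapping) using a hypothetical vocabulary and function name:

# hypothetical stand-in for Dict.convertToIdx
vocab = {'<unk>': 0, '<s>': 1, '</s>': 2, 'hello ': 3, 'world ': 4}

def convert_to_idx(words, unk_word='<unk>', bos_word=None, eos_word=None):
    ids = [vocab.get(w, vocab[unk_word]) for w in words]   # unknown words map to the UNK id
    if bos_word is not None:
        ids = [vocab[bos_word]] + ids
    if eos_word is not None:
        ids = ids + [vocab[eos_word]]
    return ids

print(convert_to_idx(['hello ', 'there ']))                                    # [3, 0] -- 'there ' is unknown
print(convert_to_idx(['hello ', 'world '], bos_word='<s>', eos_word='</s>'))  # [1, 3, 4, 2]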
Example #5
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    src, tgt, from_train = [], [], []
    raw_src, raw_tgt = [], []
    sizes = []
    count, ignored = 0, 0

    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile, encoding='utf-8')
    tgtF = open(tgtFile, encoding='utf-8')

    not_in_train_cnt = 0

    while True:
        sline = srcF.readline()
        tline = tgtF.readline()

        # normal end of file
        if sline == "" and tline == "":
            break

        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print(
                'WARNING: source and target do not have the same number of sentences'
            )
            break

        sline = sline.strip()
        tline = tline.strip()

        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue

        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()

        srcWords = sline.split()
        tgtWords = tline.split()

        # sentence lengths are within the limits
        if opt.src_length == 0 or (len(srcWords) <= opt.src_length
                                   and len(tgtWords) <= opt.tgt_length):

            if char:
                srcWords = [word + " " for word in srcWords]
                tgtWords = list(" ".join(tgtWords))
            else:
                srcWords = [word + " " for word in srcWords]
                tgtWords = [word + " " for word in tgtWords]

            # the sentence as a sequence of ids
            src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]
            # the target sentence gets GO (BOS) and EOS added
            tgt += [
                tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD, dict.BOS_WORD,
                                      dict.EOS_WORD)
            ]
            # the original sentence (still lowercased from the earlier step)
            raw_src += [srcWords]
            raw_tgt += [tgtWords]
            sizes += [len(srcWords)]

            # flag whether this target contains any new label or '<unk>'
            in_train = True
            for word in new_label + ['<unk>']:
                if word in map(lambda x: x.strip(), tgtWords):
                    in_train = False
                    not_in_train_cnt += 1
            from_train += [[in_train]]

        else:
            ignored += 1

        count += 1

        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)

    srcF.close()
    tgtF.close()

    print('not in train:', not_in_train_cnt)

    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
        from_train = [from_train[idx] for idx in perm]

    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
        from_train = [from_train[idx] for idx in perm]

    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))

    new_id = [57, 58, 59, 60, 56, 1]
    for i in range(len(from_train)):
        flag = 0
        for label in tgt[i]:
            if label in new_id:
                flag = 1
        if flag:
            assert from_train[i][0] is not True
        else:
            assert from_train[i][0] is True

    dtst = dataset(src, tgt, raw_src, raw_tgt, from_train)

    print('len tgt {}, known {}'.format(len(dtst.tgt), len(dtst.from_known)))

    for i in range(len(dtst.from_known)):
        flag = 0
        for label in dtst.tgt[i]:
            if label in new_id:
                flag = 1
        if flag:
            print(i, dtst.tgt[i], dtst.from_known[i][0])
            assert dtst.from_known[i][0] is not True
        else:
            print(i, dtst.tgt[i],
                  dtst.from_known[i][0])  # hint: found that train only has 10534
            assert dtst.from_known[i][0] is True
    # import time
    # time.sleep(10)

    return dtst
Example #6
def test(args):
    device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
    transform = [transforms.Resize((256, 256), Image.BICUBIC),
                 transforms.ToTensor(),
                 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]

    test_data_loader = dataset(root_dir=args.DATA.data_path,
                               dataset=args.DATA.dataset,
                               mode='test',
                               direction=args.DATA.direction,
                               transform=transform)

    test_loader = DataLoader(test_data_loader,
                             batch_size=1,
                             shuffle=True,
                             num_workers=4)

    # Define models
    generator = Generator(args.MODEL.ngf,
                          args.MODEL.input_nc,
                          args.MODEL.output_nc)
    # Load weights
    checkpoint = args.EVALUATION.evaluation_path
    checkpoint = os.path.join(checkpoint, "best.pth.tar")
    load(generator, checkpoint)
    generator.to(device)
    L1_loss = torch.nn.L1Loss()


    t_loss = 0
    generator.eval()

    for i, batch in enumerate(test_loader):
        input = batch['input'].to(device)
        target = batch['target'].to(device)

        # Generator inference (no gradients needed at test time)
        with torch.no_grad():
            output = generator(input)
        test_loss = L1_loss(output, target)

        t_loss += test_loss

        if args.EVALUATION.plot:
            # Display the input and output images side by side
            input = input.detach().cpu().numpy()
            input = input.squeeze().transpose((1, 2, 0))

            output = output.detach().cpu().numpy()
            output = output.squeeze().transpose((1, 2, 0))

            fig = plt.figure()
            ax1 = fig.add_subplot(1, 2, 1)
            ax1.set_title('input')
            ax1.imshow(input)

            ax2 = fig.add_subplot(1, 2, 2)
            ax2.set_title('output')
            ax2.imshow(output)
            plt.pause(1)

        print("Iter:", i, "L1 loss:", test_loss.item())
    print("Final Test Loss: ", t_loss.item()/len(test_loader))
Example #7
def main(args):
    train_loader, test_loader = dataset()
    model_obj = STNTrain(train_loader, test_loader, learning_rate=0.001)
    model_obj.train(25, args.checkpoint_path, args.export_path)
Example #8
def makeData(qFile, rFile, Dicts):
    print('Processing %s & %s ...' % (qFile, rFile))
    qF = open(qFile, 'r', encoding='utf-8')
    aF = open(rFile, 'r', encoding='utf-8')

    # Responses: one line per answer, converted to ids and padded/truncated to opt.a_length.
    ridx = []
    for aline in aF.readlines():
        aWords = aline.strip().split()
        aWords = [word + " " for word in aWords]
        aWords = Dicts.convertToIdx(aWords, dict.UNK_WORD)
        if len(aWords) < opt.a_length:
            aWords.extend([0] * (opt.a_length - len(aWords)))   # right-pad with the 0 id
        else:
            aWords = aWords[-opt.a_length:]                     # keep only the last a_length ids
        ridx.append(aWords)

    # Questions: lines grouped into sessions separated by 'END' marker lines.
    # Each question is padded/truncated to opt.q_length, each session to opt.utter_length.
    qidx = []
    qSession = []
    for line in qF.readlines():
        qline = line.strip()
        if qline != 'END':
            qWords = qline.split()
            qWords = [word + " " for word in qWords]
            qWords = Dicts.convertToIdx(qWords, dict.UNK_WORD)
            if len(qWords) < opt.q_length:
                qWords.extend([0] * (opt.q_length - len(qWords)))
            else:
                qWords = qWords[-opt.q_length:]
            qSession.append(qWords)
        else:
            if len(qSession) <= opt.utter_length:
                qSession.extend(
                    (opt.utter_length - len(qSession)) * [[0] * opt.q_length])
            else:
                qSession = qSession[-opt.utter_length:]
            qidx.append(qSession)
            qSession = []

    print(len(qidx), len(ridx))
    qF.close()
    aF.close()

    return dataset(torch.Tensor(qidx), torch.Tensor(ridx))
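
A minimal sketch of the fixed-length pad/truncate rule applied to every id sequence above, on toy values (a_length and the 0 padding id are assumptions taken from the code):

a_length = 5
ids = [7, 3, 9]
if len(ids) < a_length:
    ids.extend([0] * (a_length - len(ids)))   # right-pad with the 0 id
else:
    ids = ids[-a_length:]                     # keep only the last a_length ids
print(ids)  # [7, 3, 9, 0, 0]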
Example #9
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False):
    src, tgt = [], []
    raw_src, raw_tgt = [], []
    sizes = []
    count, ignored = 0, 0

    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile, encoding='utf8')
    tgtF = open(tgtFile, encoding='utf8')

    while True:
        sline = srcF.readline()
        tline = tgtF.readline()

        # normal end of file
        if sline == "" and tline == "":
            break

        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print(
                'WARNING: source and target do not have the same number of sentences'
            )
            break

        sline = sline.strip()
        tline = tline.strip()

        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue

        sline = sline.lower()
        tline = tline.lower()

        srcWords = sline.split()
        tgtWords = tline.split()

        if opt.trun_src > 0:
            srcWords = srcWords[:opt.trun_src]
        if opt.trun_tgt > 0:
            tgtWords = tgtWords[:opt.trun_tgt]

        srcWords = [word + " " for word in srcWords]
        tgtWords = [word + " " for word in tgtWords]

        src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]
        tgt += [
            tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD, dict.BOS_WORD,
                                  dict.EOS_WORD)
        ]
        raw_src += [srcWords]
        raw_tgt += [tgtWords]
        sizes += [len(srcWords)]

        count += 1

        if count % 1000 == 0:
            print('... %d sentences prepared' % count)

    srcF.close()
    tgtF.close()

    print('Prepared %d sentences (%d ignored due to length == 0)' %
          (len(src), ignored))

    return dataset(src, tgt, raw_src, raw_tgt)