Example #1
 def __init__(
     self,
     urls,
     *,
     length=None,
     open_fn=gopen.reader,
     handler=reraise_exception,
     tarhandler=None,
     prepare_for_worker=True,
     initial_pipeline=None,
     shard_selection=worker_urls,
 ):
     tarhandler = handler if tarhandler is None else tarhandler
     IterableDataset.__init__(self)
     SampleIterator.__init__(
         self,
         initial_pipeline=initial_pipeline,
         tarhandler=tarhandler,
         open_fn=open_fn,
     )
     if isinstance(urls, str):
         urls = list(braceexpand.braceexpand(urls))
     self.urls = urls
     self.length = length
     self.handler = handler
     self.total = 0
     self.reseed_hook = do_nothing
     self.node_selection = identity
     self.shard_selection = shard_selection
     self.shard_shuffle = identity
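
Since the constructor accepts either a list of URLs or a single brace pattern (see the isinstance check above), a string argument is expanded with braceexpand into the concrete shard list. A minimal sketch of that expansion, using a purely illustrative pattern:

import braceexpand

# A hypothetical brace pattern; the expanded list is what self.urls ends up holding.
urls = list(braceexpand.braceexpand("shards-{0000..0002}.tar"))
# -> ['shards-0000.tar', 'shards-0001.tar', 'shards-0002.tar']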
Example #2
def list_connected_datapipes(scan_obj, exclude_primitive):

    f = io.BytesIO()
    p = pickle.Pickler(f)  # Not going to work for lambdas, but dill infinite loops on typing and can't be used as is

    def stub_pickler(obj):
        return stub_unpickler, ()

    captured_connections = []

    def getstate_hook(obj):
        state = {}
        for k, v in obj.__dict__.items():
            if callable(v) or isinstance(v, PRIMITIVE):
                continue
            state[k] = v
        return state

    def reduce_hook(obj):
        if obj == scan_obj:
            raise NotImplementedError
        else:
            captured_connections.append(obj)
            return stub_unpickler, ()

    # TODO(VitalyFedyunin):  Better do it as `with` context for safety
    IterableDataset.set_reduce_ex_hook(reduce_hook)
    if exclude_primitive:
        IterableDataset.set_getstate_hook(getstate_hook)
    p.dump(scan_obj)
    IterableDataset.set_reduce_ex_hook(None)
    if exclude_primitive:
        IterableDataset.set_getstate_hook(None)
    return captured_connections
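
The TODO above notes that installing the hooks would be safer as a `with` context, so that an exception during p.dump(scan_obj) cannot leave them set. A minimal sketch of that idea, assuming the same set_reduce_ex_hook/set_getstate_hook class methods used in the example:

from contextlib import contextmanager

@contextmanager
def serialization_hooks(reduce_hook, getstate_hook=None):
    # Install the hooks and guarantee they are cleared even if pickling fails.
    IterableDataset.set_reduce_ex_hook(reduce_hook)
    if getstate_hook is not None:
        IterableDataset.set_getstate_hook(getstate_hook)
    try:
        yield
    finally:
        IterableDataset.set_reduce_ex_hook(None)
        if getstate_hook is not None:
            IterableDataset.set_getstate_hook(None)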
Example #3
def train_val_split(path, batch_size=64, decoder=None):
    samples = glob.glob(path)
    np.random.shuffle(samples)
    nb_train = math.ceil(0.9 * len(samples))  # 100k+ samples in total: ~90k for training, ~10k+ for validation
    train_samples = samples[:nb_train]
    train_dataset = IterableDataset(
        train_samples, transform=None, decoder=decoder)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)

    val_samples = samples[nb_train:]

    val_dataset = IterableDataset(val_samples, transform=None, decoder=decoder)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_loader, val_loader, train_samples, val_samples
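
A call site would look roughly like the following; the glob pattern and batch size are illustrative only:

# Hypothetical usage: split the matched samples 90/10 and get a loader for each part.
train_loader, val_loader, train_samples, val_samples = train_val_split(
    "data/shards/*.tar", batch_size=32)
print(len(train_samples), len(val_samples))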
Example #4
 def __init__(
     self, dataset=None, workers=4, output_size=100, pin_memory=True, prefetch=-1
 ):
     IterableDataset.__init__(self)
     omp_warning()
     self.output_queue = mp.Queue(output_size)
     self.pin_memory = pin_memory
     self.jobs = []
     for i in range(workers):
         job = mp.Process(
             target=_parallel_job,
             args=(dataset, i, workers, prefetch, self.output_queue),
             daemon=True,
         )
         self.jobs.append(job)
         job.start()
     D("started")
Example #5
def list_connected_datapipes(scan_obj):

    f = io.BytesIO()
    p = pickle.Pickler(f)  # Not going to work for lambdas, but dill infinite loops on typing and can't be used as is

    def stub_pickler(obj):
        return stub_unpickler, ()

    captured_connections = []

    def reduce_hook(obj):
        if obj == scan_obj:
            raise NotImplementedError
        else:
            captured_connections.append(obj)
            return stub_unpickler, ()

    # TODO(VitalyFedyunin):  Better do it as `with` context for safety
    IterableDataset.set_reduce_ex_hook(reduce_hook)
    p.dump(scan_obj)
    IterableDataset.set_reduce_ex_hook(None)
    return captured_connections
Example #6
 def __init__(self):
     IterableDataset.__init__(self)
     self.images_and_density_maps = pipeline_results
     self.image_transform = torch_transforms.Compose([
         torch_transforms.ToTensor()
     ])
Example #7
def torchIterableDataset():
    from torch.utils.data import Dataset, IterableDataset, DataLoader
    import torchvision.transforms as transforms
    import numpy as np
    from PIL import Image
    import os
    import glob
    import math
    import cv2
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),  # randomly flip the image horizontally
        transforms.RandomRotation(15),  # randomly rotate the image by up to 15 degrees
        # convert the image to a Tensor and normalize values to [0, 1] (data normalization)
        transforms.ToTensor(),
    ])

    test_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
    ])

    class IterableDataset(IterableDataset):
        def __init__(self, filepath, transform=None):
            # super().__init__()
            self.filepath = filepath
            img_size = (50, 120)
            self.width, self.height = img_size[1], img_size[0]
            self.transform = transform

        def parseFile(self, filepath):
            with open(filepath, 'r') as f:
                for line in f:

                    token = line.strip('\n').strip(' ')
                    print('-----', line, token)
                    yield from token

        # def get_stream(self, filepath):
        # from itertools import cycle
        # return self.parseFile(filepath)

        def read_img(self, img_dir):
            # print(img_dir)
            for img in img_dir:
                print(img)
                img_gray = Image.open(img).convert('L')
                img_two = img_gray.point(lambda x: 255 if x > 129 else 0)

                one_channel = cv2.resize(np.array(img_two),
                                         (self.width, self.height))
                x = np.array([one_channel, one_channel,
                              one_channel]).transpose(1, 2, 0)

                #x = cv2.resize(cv2.imread(img), (self.width, self.height))

                if self.transform is not None:
                    x = self.transform(x)

                # print(numpy.array(x).transpose(1,2 , 0).shape)
                # cv2.imshow('new', numpy.array(x).transpose(1,2 , 0))
                # cv2.waitKey(0)
                y = [keys.get(i) for i in img[-8:-4].lower()]

                # print('---', x,y)
                yield x, np.array(y)

        def __iter__(self):
            return self.read_img(self.filepath)

    samples = glob.glob(r'/Users/faith/Downloads/captcha-dataset/label/*png')

    for s in samples:
        if len(os.path.basename(s)) != 8:
            print(s)
            os.remove(s)

    np.random.shuffle(samples)
    nb_train = math.ceil(0.9 * len(samples))  # 100k+ samples in total: ~90k for training, ~10k+ for validation
    train_samples = samples[:nb_train]
    test_samples = samples[nb_train:]

    train_dataset = IterableDataset(train_samples, transform=train_transform)
    train_loader = DataLoader(train_dataset, batch_size=2)
    print(train_loader)
    letter_list = [chr(i) for i in range(97, 123)]
    char_list = [str(i) for i in range(0, 10)] + letter_list

    keys = {}
    values = {}
    for i, c in enumerate(char_list):
        keys[c] = i
        values[i] = c
    for i, data in enumerate(train_loader):
        print(i, data)
        break
    # test_dataset = IterableDataset(test_samples, transform=test_transform)
    # test_loader = DataLoader(test_dataset, batch_size=100)

    import pretrainedmodels
    import torch.nn as nn

    class CaptchaModel(nn.Module):
        def __init__(self, num_classes=len(keys)):
            super(CaptchaModel, self).__init__()
            model_name = 'xception'
            self.model = pretrainedmodels.__dict__[model_name](
                num_classes=1000, pretrained='imagenet')
            conv1 = self.model.conv1
            self.model.conv1 = nn.Conv2d(
                in_channels=3,
                out_channels=conv1.out_channels,
                kernel_size=conv1.kernel_size,
                stride=conv1.stride,
                padding=conv1.padding,
                bias=conv1.bias is not None,  # nn.Conv2d expects a bool, not the bias tensor
            )

            # copy pretrained weights
            self.model.conv1.weight.data[:, :3, :, :] = conv1.weight.data
            self.model.conv1.weight.data[:, 3:, :, :] = conv1.weight.data[:, :1, :, :]

            self.model.avgpool = nn.AdaptiveAvgPool2d(1)
            in_features = self.model.last_linear.in_features
            self.model.last_linear = nn.Linear(in_features, num_classes)

        def forward(self, x):
            # NOTE: the backbone is run four times, yielding four identical logit
            # tensors, one per captcha character position.
            return self.model(x), self.model(x), self.model(x), self.model(x)
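
The model returns four logit tensors, one per captcha character, and the dataset yields labels as a length-4 array of class indices, so a training step would plausibly sum one cross-entropy term per character. A minimal sketch under those assumptions, reusing CaptchaModel and train_loader from the example (optimizer and learning rate are illustrative):

import torch
import torch.nn as nn

model = CaptchaModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for x, y in train_loader:
    optimizer.zero_grad()
    outputs = model(x)  # tuple of four per-character logits
    # labels have shape (batch, 4): one class index per character position
    loss = sum(criterion(out, y[:, i].long()) for i, out in enumerate(outputs))
    loss.backward()
    optimizer.step()
    break  # a single step, for illustration only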