def main():
    parser = argparse.ArgumentParser()
    # TODO: show the help message instead of an error message when arguments are missing

    parser.add_argument("-dicom_dir",
                        help='directory of the DICOM files',
                        type=str)

    parser.add_argument(
        "-output_dir",
        help='output directory where all the .png files will be stored',
        type=str)

    parser.add_argument("-num_workers",
                        help='number of CPU workers',
                        type=int,
                        default=16)

    args = parser.parse_args()

    # create the output directory if it does not already exist
    os.makedirs(args.output_dir, exist_ok=True)

    convert_to_png_ = partial(convert_to_png, dir_img=args.output_dir)

    parallel(convert_to_png_,
             list(Path(args.dicom_dir).iterdir()),
             max_workers=args.num_workers)
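
The worker convert_to_png is not shown in this example. A minimal sketch of what it might look like, assuming pydicom and Pillow are available and that this parallel helper passes each path (some versions also pass a position index):

import numpy as np
import pydicom
from PIL import Image

def convert_to_png(fname, pos=None, dir_img='.'):
    # Hypothetical helper: read one DICOM file and write it out as an 8-bit PNG.
    dcm = pydicom.dcmread(str(fname))
    px = dcm.pixel_array.astype(np.float32)
    px = (px - px.min()) / max(px.max() - px.min(), 1e-6) * 255.0  # rescale to 0..255
    Image.fromarray(px.astype(np.uint8)).save(f'{dir_img}/{fname.stem}.png')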
Example #2
def create_dataset(path_fullRes: Path, path_list, downsize=True):
    il = ImageList.from_folder(path_fullRes)

    for p, size, qf in path_list:
        if not p.exists():
            print(f"Creating {p}")
            print(f"Size: {size} with {qf} quality factor")
            parallel(partial(create_training_images,
                             p_hr=path_fullRes,
                             p_lr=p,
                             size=size,
                             qualityFactor=qf,
                             downsize=downsize), il.items)
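
create_training_images is not defined here either. One plausible sketch, assuming it writes a downsized, JPEG-compressed copy of each high-resolution image into the low-resolution folder (all names and defaults below are illustrative):

from PIL import Image

def create_training_images(fn, i=None, p_hr=None, p_lr=None,
                           size=256, qualityFactor=75, downsize=True):
    # Hypothetical helper: degrade one high-res image into the low-res training set.
    dest = (p_lr / fn.relative_to(p_hr)).with_suffix('.jpg')
    dest.parent.mkdir(parents=True, exist_ok=True)
    img = Image.open(fn).convert('RGB')
    if downsize:
        img = img.resize((size, size), Image.BICUBIC)
    img.save(dest, 'JPEG', quality=qualityFactor)  # the quality factor controls compression loss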
Example #3
def preprocess_parallel(arr, size, directory, target):
    args = [(name, size, directory, target) for name in arr]
    status = parallel(preProcess, args)
    if sum(status) == len(arr):
        print("STATUS_CODE == OK")
    else:
        print("STATUS_CODE == NOT ALL FILES COULD BE LOADED {}/{}".format(
            sum(status), len(arr)))
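
preProcess itself is not shown. A rough sketch, assuming each argument tuple is (filename, target size, source directory, destination directory) and that the worker signals success with 1 and failure with 0 so the counts above add up:

import os
from PIL import Image

def preProcess(arg, i=None):
    # Hypothetical worker: resize one image into the target directory.
    name, size, directory, target = arg
    try:
        img = Image.open(os.path.join(directory, name)).convert('RGB')
        img.resize((size, size), Image.BICUBIC).save(os.path.join(target, name))
        return 1
    except (OSError, ValueError):
        return 0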
Example #4
def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
    """
    Prepare embedding features of all issues in a given repository.

    Returns
    ------
    dict
        {'features':list, 'labels':list, 'nums':list}
    """
    # prepare list of issue nums
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    issues = parallel(get, list(range(1, max_num + 1)), max_workers=workers)
    # filter out issues with problems
    filtered_issues = []

    for issue in issues:
        if issue:
            filtered_issues.append(issue)

    logging.info(f'Retrieved {len(filtered_issues)} issues.')

    features = []
    labels = []
    nums = []
    for issue in tqdm(filtered_issues):
        labels.append(issue['labels'])
        nums.append(issue['num'])
        # calculate embedding
        text = inf_wrapper.process_dict(issue)['text']
        feature = inf_wrapper.get_pooled_features(text).detach().cpu()
        # only need the first 1600 dimensions
        features.append(feature[:, :1600])

    assert len(features) == len(labels), \
        'Mismatch between the number of observations and labels.'

    return {
        'features': torch.cat(features).numpy(),
        'labels': labels,
        'nums': nums
    }
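
get_issue_text and find_max_issue_num are assumed helpers that are not shown. A rough sketch of get_issue_text against the public GitHub REST API (unauthenticated and with simplified error handling, so treat it only as an illustration):

import requests

def get_issue_text(num, idx=None, owner=None, repo=None, skip_issue=True):
    # Hypothetical helper: fetch one issue's title/body/labels, or None on failure.
    url = f'https://api.github.com/repos/{owner}/{repo}/issues/{num}'
    resp = requests.get(url)
    if resp.status_code != 200:
        if skip_issue:
            return None
        resp.raise_for_status()
    data = resp.json()
    if 'pull_request' in data:  # pull requests share the issue number space
        return None
    return {'num': num,
            'title': data.get('title') or '',
            'body': data.get('body') or '',
            'labels': [lbl['name'] for lbl in data.get('labels', [])]}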
Example #5
def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
    """
    Prepare embedding features of all issues in a given repository.

    Returns
    ------
    dict
        {'features':list, 'labels':list, 'nums':list}
    """
    # prepare list of issue nums
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    issues = parallel(get, list(range(1, max_num + 1)), max_workers=workers)
    # filter out issues with problems
    filtered_issues = []

    if not issues:
        raise ValueError(f"No issues retrieved for {owner}/{repo}")
    for issue in issues:
        if issue:
            filtered_issues.append(issue)

    logging.info(
        f'Repo {owner}/{repo} Retrieved {len(filtered_issues)} issues.')

    features = []
    labels = []
    nums = []
    issues_dict = {'title': [], 'body': []}
    for issue in tqdm(filtered_issues):
        labels.append(issue['labels'])
        nums.append(issue['num'])
        issues_dict['title'].append(issue['title'])
        issues_dict['body'].append(issue['body'])

    features = inf_wrapper.df_to_embedding(pd.DataFrame.from_dict(issues_dict))

    assert len(features) == len(labels), \
        'Mismatch between the number of observations and labels.'

    return {'features': features[:, :1600], 'labels': labels, 'nums': nums}
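
Both variants also depend on find_max_issue_num. A plausible sketch against the GitHub REST API, again unauthenticated and purely illustrative:

import requests

def find_max_issue_num(owner, repo):
    # Hypothetical helper: return the highest issue/PR number in the repository.
    url = f'https://api.github.com/repos/{owner}/{repo}/issues'
    resp = requests.get(url, params={'state': 'all', 'sort': 'created',
                                     'direction': 'desc', 'per_page': 1})
    resp.raise_for_status()
    items = resp.json()
    return items[0]['number'] if items else 0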
Example #6
def download_wiki_images(wiki_csv,
                         dest,
                         dest_csv=None,
                         force_download=False,
                         max_rows=None,
                         max_workers=8):
    """
    Download wiki images from the wiki csv. Save a csv with an added column to the path of the image.
    
    """
    dest_csv = dest_csv or os.path.join(dest, 'downloaded.csv')

    def try_convert_to_list(x):
        try:
            r = ast.literal_eval(x)
            return r if isinstance(r, list) else None
        except (ValueError, SyntaxError):
            return None

    if not os.path.exists(dest):
        os.makedirs(dest)
    if os.path.exists(dest_csv) and not force_download:
        df = pd.read_csv(dest_csv)
        df['images'] = df['images'].apply(try_convert_to_list)
        df = df[df['images'].notnull()]
        return df

    df = pd.read_csv(wiki_csv)
    df['images'] = df['images'].apply(try_convert_to_list)
    df = df[df['images'].notnull()]
    if max_rows is not None:
        df = df.iloc[:max_rows]
    paths = parallel(partial(download_single_image, dest),
                     df['images'],
                     max_workers=max_workers)
    df['image_path'] = pd.Series(dict(paths)).drop_duplicates()
    df = df[df['image_path'].notnull()]
    df.to_csv(dest_csv, index=False)
    return df
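
download_single_image is not shown. One possible shape, assuming the parallel helper passes each row's URL list plus its position, and that the returned (index, path) pairs are what dict(paths) expects above:

import os
import requests

def download_single_image(dest, image_urls, idx=None):
    # Hypothetical worker: save the first URL in the list that downloads cleanly.
    # Returns (idx, local_path), or (idx, None) if every URL failed.
    for url in image_urls:
        fname = os.path.join(dest, os.path.basename(url.split('?')[0]))
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            with open(fname, 'wb') as f:
                f.write(resp.content)
            return idx, fname
        except requests.RequestException:
            continue
    return idx, None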
Example #7
def __call__(self, items):
    toks = []
    if isinstance(items[0], Path): items = [read_file(i) for i in items]
    chunks = [items[i:i + self.chunksize] for i in range(0, len(items), self.chunksize)]
    toks = parallel(self.proc_chunk, chunks, max_workers=8)
    return sum(toks, [])
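
The snippet assumes a surrounding class providing chunksize, proc_chunk, and a read_file helper, none of which are shown. One possible shape, with all names treated as illustrative:

from pathlib import Path

def read_file(p):
    # Hypothetical helper: load one document from disk.
    return Path(p).read_text(encoding='utf-8')

class ChunkTokenizer:
    # Hypothetical owner of the __call__ above.
    def __init__(self, tokenizer, chunksize=1000):
        self.tokenizer, self.chunksize = tokenizer, chunksize

    def proc_chunk(self, chunk, i=None):
        # Tokenize one chunk of documents; returns a list of token lists.
        return [self.tokenizer(t) for t in chunk]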
Example #8
def get_largest_img_size(img_list, path=None, max_workers=20):
    if path: img_list = [f'{path}/{fn}' for fn in img_list]
    imgs_shape = np.array(
        parallel(get_img_dimension, img_list, max_workers=max_workers)).T
    return [max(imgs_shape[0]), max(imgs_shape[1]), max(imgs_shape[2])]
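
get_img_dimension is not shown; the transpose above implies it returns three numbers per image. A minimal sketch using Pillow, assuming a (height, width, channels) tuple:

from PIL import Image

def get_img_dimension(fn, i=None):
    # Hypothetical helper: return (height, width, channels) for one image file.
    with Image.open(fn) as img:
        w, h = img.size
        return h, w, len(img.getbands())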
Example #9
def window_and_normalize(im):
    rescaled = im.pixel_array * float(im.RescaleSlope) + float(im.RescaleIntercept)
    windowed = rescaled.clip(min=window_center-window_width, max=window_center+window_width)

    # map [center - width, center + width] to [0, 255]
    return (windowed - (window_center - window_width)) / (window_width * 2 / 255)

plt.imshow(window_and_normalize(im), cmap=plt.cm.bone)
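
window_and_normalize reads window_center and window_width from module scope, and neither is defined in the snippet. One way to set them is from the DICOM header itself (the attributes can be multi-valued, hence the guard); note that the snippet treats window_width as a half-width (the clip range is center ± width), whereas the DICOM WindowWidth tag stores the full width:

import pydicom
from pydicom.multival import MultiValue

dcm = pydicom.dcmread('some_slice.dcm')  # placeholder path

def _first(v):
    # WindowCenter/WindowWidth may be a single value or a MultiValue list.
    return float(v[0]) if isinstance(v, MultiValue) else float(v)

window_center = _first(dcm.WindowCenter)
window_width = _first(dcm.WindowWidth) / 2  # convert the full width to the half-width used above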

def resize(src, dst, sz):
    im = pydicom.read_file(str(src))
    ary = window_and_normalize(im)
    im = PIL.Image.fromarray(ary.astype(np.uint8), mode='L')  # mode 'L' expects unsigned 8-bit values
    im.resize((sz,sz), resample=PIL.Image.BICUBIC).save(f'{dst}/{src.stem}.png')
    
import pandas as pd
df = pd.read_csv('/media/docear/My Passport/Kaggle/Hemorrhage/stage_1_train.csv') # path to your CSV
df[df.ID.str.match('ID_6431af929')]        # inspect the known corrupted DICOM record
df = df[~df.ID.str.match('ID_6431af929')]  # drop it from the training labels
df.to_csv(revisedtrainCSVpath, index=False)  # revisedtrainCSVpath: destination of the cleaned CSV

print('Processing Train Set')
def resize_112(path, _): resize(path, '/media/docear/My Passport/Kaggle/Hemorrhage/data/112/train', 112) # set the destination file address to a new folder that will accept the processed training set
parallel(resize_112, list(paths.iterdir()), max_workers=12)

print('Processing Test Set')
def resize_112_test(path, _): resize(path, '/media/docear/My Passport/Kaggle/Hemorrhage/data/112/test', 112) # set the destination file address to a new folder that will accept the processed test set
parallel(resize_112_test, list(pathsTest.iterdir()), max_workers=12)

    numOfImages = len(files)

    t = time.time()
    for file in files:
        with Image.open(os.path.join(INPATH, file)) as im:
            for i in range(1, tilesPerImage + 1):
                newname = file.replace('.', '_{:03d}.'.format(i))
                w, h = im.size
                x = random.randint(0, w - dx - 1)
                y = random.randint(0, h - dy - 1)
                #print("Cropping {}: {},{} -> {},{}".format(file, x,y, x+dx, y+dy))
                crop = im.crop((x, y, x + dx, y + dy))
                resize = crop.resize((200, 200), Image.LANCZOS)  # ANTIALIAS was removed in Pillow 10; LANCZOS is equivalent
                resize.save(os.path.join(OUTPATH, newname))

    t = time.time() - t
    print("Done {} images in {:.2f}s".format(numOfImages, t))
    print("({:.1f} images per second)".format(numOfImages / t))
    print("({:.1f} tiles per second)".format(tilesPerImage * numOfImages / t))


INPATH = r"data"
LIST = ['1', '2', '5', '8']
OUTPATH = r"resized"
CROP = 256
NUMBER_CROPS = 5
RESIZE = 200

parallel(generate(INPATH, LIST, OUTPATH, CROP, NUMBER_CROPS, RESIZE),
         [x for x in range(len(os.listdir(INPATH)))])
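
The call above relies on a generate() factory whose definition is not shown in full. A plausible reconstruction, assuming it closes over the configuration and returns a per-index worker compatible with parallel(func, range(n)); how LIST filters the inputs is a guess:

import os
import random
from PIL import Image

def generate(inpath, keep_list, outpath, crop, tiles_per_image, resize_to):
    # Assumed: keep_list selects which source images to process.
    files = [f for f in os.listdir(inpath)
             if os.path.splitext(f)[0] in keep_list]

    def worker(idx, pos=None):
        file = files[idx % len(files)]
        with Image.open(os.path.join(inpath, file)) as im:
            w, h = im.size
            for i in range(1, tiles_per_image + 1):
                newname = file.replace('.', '_{:03d}.'.format(i))
                x = random.randint(0, w - crop - 1)
                y = random.randint(0, h - crop - 1)
                tile = im.crop((x, y, x + crop, y + crop))
                tile.resize((resize_to, resize_to),
                            Image.LANCZOS).save(os.path.join(outpath, newname))

    return worker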
Example #11
def main():
    model = StegNet(10, 6)
    print("Created Model")

    if args.train:
        data_train = ImageLoader(args.datapath + '/train', args.num_train,
                                 args.fourierSeed, args.size, args.bs)
        data_val = ImageLoader(args.datapath + '/val', args.num_val,
                               args.fourierSeed, args.size, args.bs)
        data = DataBunch(data_train, data_val)

        print("Loaded DataSets")

        if args.model is not None:
            model.load_state_dict(torch.load(args.model))
            print("Loaded pretrained model")

        loss_fn = mse

        learn = Learner(data,
                        model,
                        loss_func=loss_fn,
                        metrics=[mse_cov, mse_hidden])

        print("training")
        fit_one_cycle(learn, args.epochs, 1e-2)

        torch.save(learn.model.state_dict(), "model.pth")
        print("model saved")

    else:
        path = input(
            "Enter path of the model: ") if args.model is None else args.model
        model.load_state_dict(torch.load(path))
        model.eval()

        if args.encode:
            f_paths = [
                args.datapath + '/cover/' + f
                for f in os.listdir(args.datapath + '/cover')
            ]
            try:
                os.mkdir(args.datapath + '/encoded')
            except OSError:
                pass
            fourier_func = partial(encrypt, seed=args.fourierSeed)
            encode_partial = partial(encode,
                                     model=model.encoder,
                                     size=args.size,
                                     fourier_func=fourier_func)
            parallel(encode_partial, f_paths)

        else:
            f_paths = [
                args.datapath + '/encoded/' + f
                for f in os.listdir(args.datapath + '/encoded')
            ]
            try:
                os.mkdir(args.datapath + '/decoded')
            except OSError:
                pass
            fourier_func = partial(decrypt, seed=args.fourierSeed)
            decode_partial = partial(decode,
                                     model=model.decoder,
                                     size=args.size,
                                     fourier_func=fourier_func)
            parallel(decode_partial, f_paths)
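
The loss and metric helpers mse, mse_cov and mse_hidden referenced in main() are not shown. A minimal sketch, assuming the model returns a (cover_output, hidden_output) pair and the targets are packed the same way:

import torch.nn.functional as F

def mse_cov(pred, targ):
    # Reconstruction error on the cover/container image.
    return F.mse_loss(pred[0], targ[0])

def mse_hidden(pred, targ):
    # Reconstruction error on the revealed hidden image.
    return F.mse_loss(pred[1], targ[1])

def mse(pred, targ):
    # Combined training loss.
    return mse_cov(pred, targ) + mse_hidden(pred, targ)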