def gen_slide_names(path):
    """
    Collect the distinct parent-slide names encoded in cell image file names.

    Every ``.bmp`` file found under ``path`` must follow one of two naming
    schemes (spaces in the file name are replaced by ``-`` before matching):

    * ``1-p<prob>_<slide>_x<X>_y<Y>_w<W>_h<H>[_<n>x].bmp``
    * ``<slide>_x<X>_y<Y>_w<W>_h<H>[_s<n>].bmp``

    :param path: directory handed to ``FilesScanner``
    :return: list of slide names, unique, in first-seen order
    :raises Exception: if a file name matches neither scheme
    """
    files = FilesScanner(path, ['.bmp']).get_files()

    # The extension dot is escaped so that it only matches a literal ".".
    pattern00 = re.compile(
        r'1-p\d\.\d+_(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)(_\dx)?\.bmp')
    pattern01 = re.compile(r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)(_s\d+)?\.bmp')

    names = []
    seen = set()  # O(1) membership test; ``names`` keeps the order
    for item in files:
        basename = os.path.basename(item).replace(' ', '-')

        # Try the probability-prefixed scheme first, then the plain one.
        items = pattern00.findall(basename) or pattern01.findall(basename)
        if not items:
            raise Exception("%s IS NOT ACCEPTED!" % basename)

        parent_name = items[0][0]
        if parent_name not in seen:
            seen.add(parent_name)
            names.append(parent_name)

    return names
def find_abnormal():
    """
    Copy oversized cell images into the abnormal-image collection directory.

    Scans the hard-coded source directory for ``.jpg`` files named like
    ``1-p0.1718_TC18050036_x34939_y52118_w107_h105_2x.jpg`` or
    ``1-p0.5982_TC18053765_x46070_y20472_w26_h28_.jpg``. Every image whose
    encoded width or height exceeds 500 is copied to the destination directory
    as ``<slide>_<x>_<y>_<w>_<h>.jpg``. File names that do not match the
    pattern are printed and skipped.

    :return: None
    """
    pattern = re.compile(
        r'1-p0.\d{4}_(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)_?(\dx)?.jpg')

    src_path = "/home/cnn/Development/DATA/CELL_CLASSIFIED_JOB_20181022/CELLS/TIFFS_CHECKED"
    dst_path = "/home/cnn/Development/DATA/CELL_CLASSIFIED_JOB_20181022/CELLS/ABNORMAL_IMAGE_COLLECTIONS"

    images = FilesScanner(src_path, ['.jpg']).get_files()

    for path in images:
        basename = os.path.basename(path)
        items = re.findall(pattern, basename)

        if not items:
            # Report the unparsable name and move on; indexing the empty
            # match list would raise IndexError.
            print(basename)
            continue

        big_name, x, y, w, h, _ = items[0]

        if int(w) > 500 or int(h) > 500:
            shutil.copyfile(
                path,
                os.path.join(dst_path,
                             "%s_%s_%s_%s_%s.jpg" % (big_name, x, y, w, h)))
def collect_tiff_ctype_collection(path):
    """
    Record, per slide, which cell-type directories contain cells from it.

    The cell type is the name of the directory holding the image; the slide
    name is the first group of ``PATTERN`` applied to the file name. The result
    is written to ``tiff_children_distribution.txt`` in the working directory,
    one tab-separated line per slide: ``<slide>\\t<type>\\t<type>...``.

    :param path: directory handed to ``FilesScanner``
    :return: None
    """
    types_by_tiff = {}

    for image_path in FilesScanner(path).get_files():
        file_name = os.path.basename(image_path)
        tiff_name, _x, _y, _w, _h, _s = re.findall(PATTERN, file_name)[0]

        cell_type = os.path.basename(os.path.dirname(image_path))

        known_types = types_by_tiff.setdefault(tiff_name, [])
        if cell_type not in known_types:
            known_types.append(cell_type)

    with open('tiff_children_distribution.txt', 'w') as out:
        for tiff_name, known_types in types_by_tiff.items():
            out.write("%s\t%s\n" % (tiff_name, "\t".join(known_types)))
Example #4
0
def do_similar_remove(path):
    """
    Move near-duplicate cell images out of ``path``.

    File names must look like ``<slide>_x<X>_y<Y>_w<W>_h<H>_s<N>.jpg``. Within
    one slide, a file whose box overlaps an already kept box with
    ``cal_IOU(...) > 0.6`` is moved to
    ``REPEAT_FILE_SAVE_PATH/<cell type>``, where the cell type is the name of
    the file's directory. Otherwise the file is kept and its box is remembered
    for later comparisons.

    :param path: directory handed to ``FilesScanner``
    :return: dict mapping slide name to the list of kept
             ``(x, y, w, h, file_path)`` tuples
    """
    files = FilesScanner(path).get_files()

    pattern = re.compile(r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)_s(\d+).jpg')

    dict_ = {}
    total = len(files)
    for index, file in enumerate(files):
        print("%s / %s ..." % (index + 1, total))
        basename = os.path.basename(file)
        cell_type = os.path.basename(os.path.dirname(file))

        big_name, x, y, w, h, _ = re.findall(pattern, basename)[0]
        x, y, w, h = int(x), int(y), int(w), int(h)

        kept = dict_.setdefault(big_name, [])
        for x_, y_, w_, h_, _kept_path in kept:
            if cal_IOU((x, y, w, h), (x_, y_, w_, h_)) > 0.6:
                save_path = os.path.join(REPEAT_FILE_SAVE_PATH, cell_type)
                os.makedirs(save_path, exist_ok=True)

                shutil.move(file, save_path)
                # One overlap is enough: the file is gone now, and leaving
                # the loop keeps the ``else`` branch from recording it.
                break
        else:
            # No kept box overlaps this one (or it is the first for the slide).
            kept.append((x, y, w, h, file))

    return dict_
def tiff_readable_check(path):
    """
    Verify that the slide files under ``path`` are uniquely named and readable.

    First checks that no two files share a base name, then tries to open each
    file with ``openslide.OpenSlide`` and, if that fails, with ``TSlide``.

    :param path: directory containing the original slide images
    :return: None
    :raises Exception: when two files share a base name (the message holds both
                       paths), or when a file cannot be opened by either reader
                       (the message holds the path and the reader's error)
    """

    files = FilesScanner(path, ['.tif', 'kfb']).get_files()

    # base name -> first path seen with that name
    seen = {}
    for file in files:
        basename = os.path.basename(file)

        if basename in seen:
            raise Exception("%s\n%s" % (file, seen[basename]))
        seen[basename] = file

    for file in files:
        try:
            try:
                openslide.OpenSlide(file)
            except Exception:
                # Not readable by openslide; fall back to the TSlide reader.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                TSlide(file)
        except Exception as e:
            raise Exception("%s %s" % (file, str(e))) from e
    def run(self):
        """
        Run the two-stage pipeline over every slide in ``self.tiff_lst``.

        For each slide: slice it (unless its slice directory already exists),
        run the Darknet predictor on the slice images, write the result to
        ``<name>_seg.csv``, run the Xception predictor on the arrays built from
        that csv, write ``<name>_clas.csv`` and finally cut the review images
        into ``self.cells_path``. The elapsed time of each stage is printed.

        :return: None
        """
        print("Initial DARKNET and XCEPTION model ...")

        total = len(self.tiff_lst)
        for index, tiff in enumerate(self.tiff_lst):
            # Slide file name without its extension; spaces become dashes
            tiff_basename, _ = os.path.splitext(os.path.basename(tiff))
            tiff_basename = tiff_basename.replace(" ", "-")
            print('Process %s / %s %s ...' % (index + 1, total, tiff_basename))

            # Directory holding this slide's slice images
            slice_save_path = os.path.join(self.slice_dir_path, tiff_basename)

            t0 = datetime.datetime.now()
            # Slice only when the slice directory does not exist yet
            if not os.path.exists(slice_save_path):
                # Perform the slicing
                ImageSlice(tiff, self.slice_dir_path).get_slices()

            # Collect the paths of the slice images
            tif_images = FilesScanner(slice_save_path, ['.jpg']).get_files()
            t1 = datetime.datetime.now()
            print('TIFF SLICE COST: %s' % (t1 - t0))

            # First stage: Darknet prediction on all slice images
            seg_results = DarknetPredict().predict(tif_images)
            t2 = datetime.datetime.now()
            print('TIFF DARKNET COST: %s' % (t2 - t1))

            # save segment result into csv
            xcep_pre = XceptionPreprocess(tiff)
            seg_csv = os.path.join(self.meta_files_path,
                                   tiff_basename + "_seg.csv")
            xcep_pre.write_csv(seg_results, seg_csv)

            # generate numpy array, it is the input of second stage classification algorithm
            cell_numpy, cell_index = xcep_pre.gen_np_array_csv(seg_csv=seg_csv)

            # run classification
            predictions = XceptionPredict().predict(np.asarray(cell_numpy))
            t3 = datetime.datetime.now()
            print('XCEPTION COST: %s' % (t3 - t2))

            # summarize two stages' result and generate a final csv
            clas = XceptionPostprocess()
            clas_dict = clas.convert_all(predictions=predictions,
                                         cell_index=cell_index)
            clas_csv = os.path.join(self.meta_files_path,
                                    tiff_basename + "_clas.csv")
            clas.write_csv(clas_dict, clas_csv)

            ############################### Get review images ######################################################
            # GET VIEW CELL IMAGES
            clas.cut_cells_p_marked(tiff,
                                    clas_dict,
                                    self.cells_path,
                                    factor=0.2,
                                    N=2)
            t4 = datetime.datetime.now()
            print("TIFF %s TOTAL COST %s ..." % (tiff_basename, t4 - t0))
Example #7
0
def generate_xml_path_dict(xml_path_lst):
    """
    Build a ``{name: path}`` lookup for the xml files under ``xml_path_lst``.

    The name is the file's base name with every ``.xml`` occurrence removed.
    If two files share a name, the one scanned later wins.

    :param xml_path_lst: location of the xml files, handed to ``FilesScanner``
    :return: dict mapping name to xml file path
    """
    xml_files = FilesScanner(xml_path_lst, ['.xml']).get_files()

    return {
        os.path.basename(xml_file).replace(".xml", ""): xml_file
        for xml_file in xml_files
    }
def collect(path):
    """
    List the slides that own at least one cell of a selected type.

    An image counts when the name of its directory is in ``SELECTED``; the
    slide name is the first group of ``PATTERN`` applied to the file name.
    The unique slide names are written, one per line and in first-seen order,
    to ``work_tiff_list_20181102_SELECTED.txt`` in the working directory.

    :param path: directory handed to ``FilesScanner``
    :return: None
    """
    selected_tiffs = []

    for image_path in FilesScanner(path).get_files():
        cell_type = os.path.basename(os.path.dirname(image_path))
        if cell_type not in SELECTED:
            continue

        file_name = os.path.basename(image_path)
        tiff_name, _x, _y, _w, _h, _s = re.findall(PATTERN, file_name)[0]
        if tiff_name not in selected_tiffs:
            selected_tiffs.append(tiff_name)

    with open("work_tiff_list_20181102_SELECTED.txt", 'w') as out:
        out.write("\n".join(selected_tiffs))
Example #9
0
def generate_csv_path_dict(csv_files_path):
    """
    Build a ``{name: path}`` lookup for the ``*_clas.csv`` files.

    Only csv files whose base name ends with ``_clas.csv`` are kept; the name
    is that base name with ``_clas.csv`` removed.

    :param csv_files_path: location of the csv files, handed to ``FilesScanner``
    :return: dict mapping name to csv file path
    """
    lookup = {}

    for csv_file in FilesScanner(csv_files_path, ['.csv']).get_files():
        file_name = os.path.basename(csv_file)
        if not file_name.endswith('_clas.csv'):
            continue
        lookup[file_name.replace("_clas.csv", "")] = csv_file

    return lookup
def get_abnormal_tiff_list():
    """
    Write the unique file-name prefixes of the abnormal-image collection.

    The prefix is the part of each ``.jpg`` base name before the first ``_``.
    Unique prefixes are written, one per line and in first-seen order, to
    ``4x_tiff_lst.txt`` in the working directory.

    :return: None
    """
    src_path = '/home/cnn/Development/DATA/CELL_CLASSIFIED_JOB_20181022/CELLS/ABNORMAL_IMAGE_COLLECTIONS'

    prefixes = []
    for image_path in FilesScanner(src_path, ['.jpg']).get_files():
        prefix = os.path.basename(image_path).split('_')[0]
        if prefix not in prefixes:
            prefixes.append(prefix)

    with open("4x_tiff_lst.txt", 'w') as out:
        out.write("\n".join(prefixes))
Example #11
0
def collect(image_root_path, collect_files_save_path):
    """
    Check that every cell image sits in a directory of a known pathology type.

    The type is the name of the image's directory, cut at the first ``_`` and
    then at the first ``-``. It must be a member of
    ``PATHOLOGY_TYPE_CLASSES``.

    :param image_root_path: root directory scanned for ``.jpg`` files
    :param collect_files_save_path: accepted but not used by this function
    :return: None
    :raises Exception: for the first image whose type is not recognised
    """
    # Scan the directory that was passed in; the previous code referenced an
    # undefined name here and failed with NameError.
    images = FilesScanner(image_root_path, ['.jpg']).get_files()

    for image in images:
        cell_type = os.path.basename(os.path.dirname(image))

        # Drop any "_suffix" and then any "-suffix" from the directory name.
        cell_type = cell_type.split('_')[0]
        cell_type = cell_type.split('-')[0]

        if cell_type not in PATHOLOGY_TYPE_CLASSES:
            raise Exception("%s NOT CLASSIFIED" % image)
Example #12
0
def find_size_over_608(path):
    """
    Copy every image wider or taller than 608 pixels into ``OUT_PUT_PATH``.

    Copies are placed in ``OUT_PUT_PATH/<cell type>``, where the cell type is
    the name of the image's directory. Progress is printed per image.

    :param path: directory scanned for ``.jpg`` files
    :return: None
    """
    images = FilesScanner(path, ['.jpg']).get_files()

    total = len(images)
    for index, image in enumerate(images):
        basename = os.path.basename(image)
        ctype = os.path.basename(os.path.dirname(image))

        print("%s / %s %s" % (index + 1, total, basename))
        # Only the size is needed, so release the file handle right away
        # instead of keeping one open per scanned image.
        with Image.open(image) as img:
            w, h = img.size

        if w > 608 or h > 608:
            save_path = os.path.join(OUT_PUT_PATH, ctype)
            os.makedirs(save_path, exist_ok=True)

            shutil.copy(image, save_path)
Example #13
0
def random_cells_cut_progress(
    in_dir,
    out_path,
    start,
    end,
    num,
    size,
):
    """
    Cut patches from every ``.kfb`` file under ``in_dir`` in parallel.

    One ``worker`` task is submitted per file to a pool of 20 processes. The
    number of outstanding jobs is printed as tasks finish, and the payloads of
    all failed tasks are printed at the end.

    :param in_dir: directory with the input slide files
    :param out_path: directory where the patches are stored
    :param start: start of the cutting range
    :param end: end of the cutting range
    :param num: number of patches wanted per file
    :param size: patch size; passed to the worker as ``(size, size)``
    :return: None
    """

    kfbs = FilesScanner(in_dir, ['.kfb']).get_files()

    # Payloads of the tasks that reported status 1
    fail_task_collection = []

    # The context manager shuts the pool down even if a task raises.
    with ProcessPoolExecutor(max_workers=20) as executor:
        tasks = [
            executor.submit(worker, path, out_path, (start, end), num,
                            (size, size))
            for path in kfbs
        ]

        job_count = len(tasks)

        for future in as_completed(tasks):
            status, detail = future.result()
            if status == 1:
                fail_task_collection.append(detail)

            job_count -= 1
            print("LAST JOB NUM %s" % job_count)

    print('\n'.join(fail_task_collection))
Example #14
0
def do_repeat_remove(path):
    """
    Find cell images whose slide name and box repeat an earlier file.

    File names must look like ``<slide>_x<X>_y<Y>_w<W>_h<H>_s<N>.jpg``; two
    files repeat each other when everything but the ``_s<N>`` part is equal.
    Whenever a repeat is met, the first file seen with that key is copied to
    ``REPEAT_FILE_SAVE_PATH/<cell type>``, where the cell type is the name of
    the repeated file's directory.

    :param path: directory handed to ``FilesScanner``
    :return: dict mapping ``<slide>_<x>_<y>_<w>_<h>`` to the ``(x, y, w, h)``
             tuple of strings parsed from the first file with that key
    """
    files = FilesScanner(path).get_files()

    pattern = re.compile(r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)_s(\d+).jpg')

    dict_ = {}
    # key -> path of the first file seen with that key. Kept apart from
    # ``dict_`` so the returned mapping keeps its coordinate tuples while
    # ``shutil.copy`` gets a real path (it used to be handed the tuple).
    first_seen = {}
    for file in files:
        basename = os.path.basename(file)
        cell_type = os.path.basename(os.path.dirname(file))

        items = re.findall(pattern, basename)[0]
        key = "_".join(items[:-1])

        if key not in dict_:
            dict_[key] = items[1:-1]
            first_seen[key] = file
            continue

        save_path = os.path.join(REPEAT_FILE_SAVE_PATH, cell_type)
        os.makedirs(save_path, exist_ok=True)

        shutil.copy(first_seen[key], save_path)

    return dict_
def restore_tiff_children_lst(path):
    """
    Group the training images under ``path`` by the slide they come from.

    File names must look like ``TC18053113_x54903_y33619_w465_h522_s95.jpg``;
    the cell type is the name of the directory holding the image. The total
    number of images found is printed.

    :param path: directory scanned for ``.jpg`` files
    :return: dict mapping slide name to a list of ``(cell type, image path)``
    """
    images = FilesScanner(path, ['.jpg']).get_files()
    print("TRAIN_DATA IMAGE COUNT: %s" % len(images))

    name_pattern = re.compile(r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)_s(\d+).jpg')

    children = {}
    for image_path in images:
        file_name = os.path.basename(image_path)
        cell_type = os.path.basename(os.path.dirname(image_path))

        tiff_name, _x, _y, _w, _h, _s = re.findall(name_pattern, file_name)[0]
        children.setdefault(tiff_name, []).append((cell_type, image_path))

    return children
    dst = "/home/cnn/Development/DATA/RECHECK_DATA_IN_20181026"

    with open("names_lst.txt") as f:
        lines = f.readlines()
        already_exist_images = [
            "_".join(line.replace("\n", '').split("_")[:-1]) for line in lines
        ]

    print(already_exist_images[:100])

    pattern = re.compile(
        r'1-p0.\d{4}_(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)_?(\dx)?.jpg')

    for item in lst:
        images = FilesScanner(item, ['.jpg']).get_files()
        for name in images:
            basename = os.path.basename(name)
            big_name, x, y, w, h, _ = re.findall(pattern, basename)[0]
            basename = "%s_x%s_y%s_w%s_h%s" % (big_name, x, y, w, h)

            cell_type = os.path.basename(os.path.dirname(name))

            if basename not in already_exist_images:
                print("WA %s is NEW " % basename)
                save_path = os.path.join(dst, 'NO_CHECK', big_name, cell_type)
            else:
                print("EEEE %s is EXIST!" % basename)
                save_path = os.path.join(dst, 'CHECKED', cell_type)

            if not os.path.exists(save_path):
Example #17
0
    def run(self):
        """
        Slice every slide in ``self.tiff_lst``, detect cells and save the crops.

        For each slide: slice it (unless its slice directory already exists),
        run ``yolo_predict`` on the slice images in a process pool, then read
        each detected box from the slide and save it as a jpg under
        ``self.cells_path/<slide name>/<label>/``.

        :return: None
        """
        print("Initial DARKNET and XCEPTION model ...")

        total = len(self.tiff_lst)
        for index, tiff in enumerate(self.tiff_lst):
            # Slide file name without its extension; spaces become dashes
            tiff_basename, _ = os.path.splitext(os.path.basename(tiff))
            tiff_basename = tiff_basename.replace(" ", "-")
            print('Process %s / %s %s ...' % (index + 1, total, tiff_basename))

            # Directory holding this slide's slice images
            slice_save_path = os.path.join(self.slice_dir_path, tiff_basename)

            t0 = datetime.datetime.now()
            # Slice only when the slice directory does not exist yet
            if not os.path.exists(slice_save_path):
                # Perform the slicing
                ImageSlice(tiff, self.slice_dir_path).get_slices()

            # Collect the paths of the slice images
            tif_images = FilesScanner(slice_save_path, ['.jpg']).get_files()
            t1 = datetime.datetime.now()
            print('TIFF SLICE COST: %s' % (t1 - t0))

            tasks = []

            # Create the process pool (one worker per GPU_NUM)
            executor = ProcessPoolExecutor(max_workers=GPU_NUM)

            if len(tif_images) < cfg.darknet.min_job_length:
                # Too few slices to be worth splitting: submit a single task
                tasks.append(executor.submit(yolo_predict, '0', tif_images))
            else:
                # Split the work into chunks of n images
                # NOTE(review): n is rounded to nearest, so this can yield
                # GPU_NUM + 1 chunks (e.g. 9 images with GPU_NUM = 4 gives
                # n = 2 and 5 chunks). If the first argument of yolo_predict
                # is a GPU id, the last one would be out of range - confirm.
                n = int((len(tif_images) / float(GPU_NUM)) + 0.5)
                patches = [tif_images[i: i + n] for i in range(0, len(tif_images), n)]

                for gpu_index, patch in enumerate(patches):
                    tasks.append(executor.submit(yolo_predict, str(gpu_index), patch))

            # Merge the per-task result dicts into one
            seg_results = {}
            for future in as_completed(tasks):
                result = future.result()
                seg_results.update(result)

            # Shut the process pool down
            executor.shutdown(wait=True)

            # Open the slide with openslide, falling back to TSlide
            # NOTE(review): the bare except also swallows KeyboardInterrupt.
            try:
                slide = openslide.OpenSlide(tiff)
            except:
                slide = TSlide(tiff)

            keys = list(seg_results.keys())
            for key in keys:
                lst = seg_results[key]
                # Keys have the form "<x0>_<y0>"; presumably the origin of the
                # slice inside the slide - verify against yolo_predict
                x0, y0 = key.split('_')
                x0, y0 = int(x0), int(y0)

                for item in lst:
                    label, accuracy, (x, y, w, h) = item
                    accuracy, x, y, w, h = float(accuracy), int(x), int(y), int(w), int(h)
                    # Shift the box by the offset taken from the key
                    x, y = x0 + x, y0 + y

                    save_path = os.path.join(self.cells_path, tiff_basename, label)
                    if not os.path.exists(save_path):
                        os.makedirs(save_path)

                    # The file name carries 1 - accuracy (4 decimals), the
                    # slide name and the box; the crop is saved as RGB
                    image_name = "1-p{:.4f}_{}_x{}_y{}_w{}_h{}.jpg".format(1 - accuracy, tiff_basename, x, y, w, h)
                    slide.read_region((x, y), 0, (w, h)).convert("RGB").save(os.path.join(save_path, image_name))
def get_cell_image(path, ctype, parent_pathes):
    """
    Collect metadata for the ``.bmp`` cell images under ``path``.

    Cell images are expected at
    ``<slide type>/<dir>/<cell type>/<file>``. The slide name is parsed from
    the file name (spaces replaced by ``-``), which must follow one of:

    * ``1-p<prob>_<slide>_x<X>_y<Y>_w<W>_h<H>[_<n>x].bmp``
    * ``<slide>_x<X>_y<Y>_w<W>_h<H>[_s<n>].bmp``

    The records of each slide are also written, one JSON object per line, to
    ``METADATA_FILE_PATH/<ctype>_IMAGES_PATH_DICT/<slide>.txt``.

    Although ``.jpg`` files are scanned as well, only ``.bmp`` files are
    processed.

    :param path: directory with the cell images
    :param ctype: annotation kind, MANUAL or AUTO; used in the output path
    :param parent_pathes: dict mapping slide name to the slide's path
    :return: dict mapping slide name to a list of cell records
    :raises Exception: if a file name matches neither naming scheme
    :raises AssertionError: if ``get_location_from_filename`` returns a falsy
                            value for a file name

    The process exits when a slide name is missing from ``parent_pathes``.
    """
    files = FilesScanner(path, ['.bmp', '.jpg']).get_files()

    # Cell records grouped by slide name
    cells_dict = {}

    # e.g. 1-p0.6042_<slide>_x23043_y40485_w162_h218_2x.jpg
    pattern00 = re.compile(
        r'1-p\d\.\d+_(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)(_\dx)?.(bmp|jpg)')

    # e.g. 2018-03-22-11_26_58_x15789_y31806_w63_h61_s385.jpg
    pattern01 = re.compile(
        r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)(_s\d+)?.(bmp|jpg)')

    for item in files:
        if not item.endswith('.bmp'):
            continue

        # Cell image file name
        basename = os.path.basename(item).replace(' ', '-')

        # <slide type>/<dir>/<cell type>/<file>
        cell_type_dir = os.path.dirname(item)
        clas_type = os.path.basename(cell_type_dir)

        # Strip suffixes such as "_NEW", "_1", "_2" from the cell type
        if "_NEW" in clas_type or "_2" in clas_type or "_1" in clas_type:
            clas_type = clas_type.split("_")[0]

        items = re.findall(pattern00, basename)
        if not items:
            items = re.findall(pattern01, basename)
        if not items:
            raise Exception("%s IS NOT ACCEPTED!" % basename)

        # Both patterns capture 7 groups; only the slide name is needed here.
        # Unpacking the match into 6 names raised ValueError for every file.
        parent_name = items[0][0]

        # Slide type: the directory two levels above the cell type directory
        parent_type = os.path.basename(
            os.path.dirname(os.path.dirname(cell_type_dir)))

        # Original path of the slide
        try:
            parent_path = parent_pathes[parent_name]
        except KeyError as e:
            print("%s NOT FOUND" % parent_name)
            print("CANNOT FIND RELATIVE TIFF PATH INFO, %s\n%s" %
                  (str(e), item))
            exit()

        # Parse the box coordinates
        point = get_location_from_filename(basename)
        assert point, "THIS JPG NAME IS NOT ACCEPTED => %s" % basename

        _, x, y, w, h, _ = point
        x, y, w, h = int(x), int(y), int(w), int(h)

        # Fold every AGC sub-class into 'AGC'
        if clas_type in AGC_CLASSES:
            clas_type = 'AGC'

        if parent_type in AGC_CLASSES:
            parent_type = 'AGC'

        # Position and class information of the cell
        info = {
            'name': basename,
            'cell_type': clas_type,
            'cell_path': item,
            'parent': parent_name,
            'parent_full_name': os.path.basename(parent_path),
            'parent_type': parent_type,
            'x': x,
            'y': y,
            'w': w,
            'h': h,
        }

        cells_dict.setdefault(parent_name, []).append(info)

    # Write the records of each slide to its own file
    for key, lines in cells_dict.items():
        save_path = os.path.join(METADATA_FILE_PATH,
                                 ctype + '_IMAGES_PATH_DICT')
        os.makedirs(save_path, exist_ok=True)
        with open(os.path.join(save_path, key + '.txt'), 'w') as f:
            for line in lines:
                f.write(json.dumps(line) + '\n')

    return cells_dict
Example #19
0
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

from constants import SELECTED_CELL_XML_SAVE_PATH, MAX_CPU_WORKERS
from utils import FilesScanner, generate_selected_level_xml

# Create the SELECTED_CELL_XML_SAVE_PATH directory at import time if it is
# missing (the exists() check is redundant with exist_ok=True).
if not os.path.exists(SELECTED_CELL_XML_SAVE_PATH):
    os.makedirs(SELECTED_CELL_XML_SAVE_PATH, exist_ok=True)

if __name__ == '__main__':
    # 读取指定位置的算法人员筛选后的细胞文件路径
    # cell_images_path = CELL_IMAGES_SAVE_PATH

    cell_images_path = '/home/cnn/Development/DATA/BATCH_4_TRAIN_DATA/CELLS/'
    print("SCANNING PATH %s..." % cell_images_path)
    cell_images_lst = FilesScanner(cell_images_path, ['.jpg']).get_files()
    print("CELLS COUNT: %s" % len(cell_images_lst))

    # 2018-03-22-11_26_58_x15789_y31806_w63_h61_s385.jpg
    pattern = re.compile(r'(.*?)_x(\d+)_y(\d+)_w(\d+)_h(\d+)(_s\d+)?.jpg')

    print("COLLECT POINT INFO FROM JPG FILES...")
    tiff_cell_dict = {}
    for path in cell_images_lst:
        cell_type = os.path.basename(os.path.dirname(path))
        jpg = os.path.basename(path)
        point = re.findall(pattern, jpg)
        if not point:
            print(path)
        try:
            tiff_name, x, y, w, h, _ = point[0]
def get_cell_image(path, ctype, parent_pathes):
    """
    Collect metadata for the ``.jpg`` cell images under ``path``.

    The list of image paths is cached in
    ``METADATA_FILE_PATH/<ctype>_IMAGES_PATH_DICT.txt``: it is read from there
    when the file exists, otherwise it is produced by ``FilesScanner`` and
    written there. Images are expected at
    ``<slide type>/<slide name>/<cell type>/<file>``.

    The records of each slide are also written, one JSON object per line, to
    ``METADATA_FILE_PATH/<ctype>_IMAGES_PATH_DICT/<slide name>.txt``.

    The process exits when a slide name is missing from ``parent_pathes``.

    :param path: directory with the cell images
    :param ctype: annotation kind, MANUAL or AUTO; used in the file names
    :param parent_pathes: dict mapping slide name to the slide's path
    :return: dict mapping slide name to a list of cell records
    """
    listing_path = os.path.join(METADATA_FILE_PATH,
                                ctype + '_IMAGES_PATH_DICT.txt')

    if not os.path.exists(listing_path):
        # No cached listing yet: scan the directory and store the result
        files = FilesScanner(path, ['.jpg']).get_files()
        with open(listing_path, 'w') as listing:
            listing.writelines([entry + '\n' for entry in files])
    else:
        with open(listing_path) as listing:
            files = [entry.replace('\n', '') for entry in listing.readlines()]

    # Cell records grouped by slide name
    cells_dict = {}

    for cell_path in files:
        if not cell_path.endswith('.jpg'):
            continue

        basename = os.path.basename(cell_path)

        # Walk up: <slide type>/<slide name>/<cell type>/<file>
        cell_type_dir = os.path.dirname(cell_path)
        slide_dir = os.path.dirname(cell_type_dir)
        clas_type = os.path.basename(cell_type_dir)
        parent_name = os.path.basename(slide_dir)
        parent_type = os.path.basename(os.path.dirname(slide_dir))

        # Original path of the slide
        try:
            parent_path = parent_pathes[parent_name]
        except Exception as e:
            print("CANNOT FIND RELATIVE TIFF PATH INFO, %s" % str(e))
            exit()

        # Parse the box coordinates
        point = get_location_from_filename(basename)
        assert point, "THIS JPG NAME IS NOT ACCEPTED => %s" % basename
        _, x, y, w, h, _ = point

        # Fold every AGC sub-class into 'AGC'
        if clas_type in AGC_CLASSES:
            clas_type = 'AGC'

        # Keep only the part of the slide type after its last underscore
        parent_type = parent_type.split('_')[-1]
        if parent_type in AGC_CLASSES:
            parent_type = 'AGC'

        record = {
            'name': basename,
            'cell_type': clas_type,
            'cell_path': cell_path,
            'parent': parent_name,
            'parent_full_name': os.path.basename(parent_path),
            'parent_type': parent_type,
            'x': x,
            'y': y,
            'w': w,
            'h': h,
        }
        cells_dict.setdefault(parent_name, []).append(record)

    # Write the records of each slide to its own file
    save_path = os.path.join(METADATA_FILE_PATH, ctype + '_IMAGES_PATH_DICT')
    for slide_name, records in cells_dict.items():
        os.makedirs(save_path, exist_ok=True)
        with open(os.path.join(save_path, slide_name + '.txt'), 'w') as out:
            for record in records:
                out.write(json.dumps(record) + '\n')

    return cells_dict
def generate_labelme_format_xml(csv_files_path, patch_dict, xml_save_path):
    """
    Convert the ``*_clas.csv`` files into labelme xml files.

    Rows are grouped by their first column. For each group the matching entry
    of ``patch_dict`` supplies a ``label`` and an image ``path``; boxes that
    overlap an already kept box with ``cal_IOU(...) > 0.8`` are dropped, the
    rest are written to ``xml_save_path/<label>/<key>.xml`` and the image is
    copied next to it.

    :param csv_files_path: directory the csv files are read from
    :param patch_dict: dict mapping a row key to ``{'label': ..., 'path': ...}``
    :param xml_save_path: directory the xml files are written to
    :return: None
    :raises Exception: if a row key is missing from ``patch_dict``
    """

    def _to_pixel(value):
        # Negative coordinates are clamped to 0, the rest rounded half up
        number = float(value)
        return 0 if number < 0 else int(number + 0.5)

    def _as_xywh(box):
        # Corner representation -> (x, y, width, height)
        return (box['xmin'], box['ymin'],
                box['xmax'] - box['xmin'], box['ymax'] - box['ymin'])

    csv_files = FilesScanner(csv_files_path, postfix=['.csv']).get_files()
    clas_files = [name for name in csv_files if name.endswith('_clas.csv')]

    # Number of csv files to process
    total = len(clas_files)
    for position, file in enumerate(clas_files, start=1):
        print("Processing %s / %s %s" % (position, total, os.path.basename(file)))

        with open(file) as f:
            rows = csv.reader(f)
            # Skip the header row
            next(rows, None)

            boxes_by_key = {}
            for row in rows:
                boxes_by_key.setdefault(row[0], []).append({
                    'name': row[3],
                    'xmin': _to_pixel(row[5]),
                    'ymin': _to_pixel(row[6]),
                    'xmax': _to_pixel(row[7]),
                    'ymax': _to_pixel(row[8]),
                })

            for key, boxes in boxes_by_key.items():
                if key not in patch_dict:
                    raise Exception("%s NOT FOUND IN DICT" % file)

                patch = patch_dict[key]
                label = patch['label']
                image_path = patch['path']

                save_path = os.path.join(xml_save_path, label)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)

                # remove duplicated cells
                unique_boxes = []
                for box in boxes:
                    candidate = _as_xywh(box)
                    overlaps = any(
                        cal_IOU(candidate, _as_xywh(kept)) > 0.8
                        for kept in unique_boxes)
                    if not overlaps:
                        unique_boxes.append(box)

                write_to_labelme_xml(unique_boxes,
                                     os.path.join(save_path, key + '.xml'))
                shutil.copy(image_path, save_path)
    def run(self):
        """
        Run the two-stage pipeline over every slide in ``self.tiff_lst``.

        A slide is skipped when its directory under ``self.cells_path`` exists
        and is not empty. Otherwise: slice it (unless its slice directory
        already exists), run ``yolo_predict`` in a process pool and write
        ``<name>_seg.csv`` (unless that csv already exists), run
        ``xception_predict`` in a process pool, write ``<name>_clas.csv`` and
        cut the review images into ``self.cells_path``. The elapsed time of
        each stage is printed.

        :return: None
        """
        print("Initial DARKNET and XCEPTION model ...")

        total = len(self.tiff_lst)
        for index, tiff in enumerate(self.tiff_lst):
            # Slide file name without its extension; spaces become dashes
            tiff_basename, _ = os.path.splitext(os.path.basename(tiff))
            tiff_basename = tiff_basename.replace(" ", "-")
            print('Process %s / %s %s ...' % (index + 1, total, tiff_basename))

            # Check whether slicing and recognition were already completed:
            # if the cell directory exists and is not empty, skip the slide
            check_cell_path = os.path.join(self.cells_path, tiff_basename)

            if os.path.exists(check_cell_path):
                children = os.listdir(check_cell_path)
                if len(children) > 0:
                    print("%s HAS BEEN PROCESSED!" % tiff_basename)
                    continue

            # Directory holding this slide's slice images
            slice_save_path = os.path.join(self.slice_dir_path, tiff_basename)

            t0 = datetime.datetime.now()
            # Slice only when the slice directory does not exist yet
            if not os.path.exists(slice_save_path):
                # Perform the slicing
                ImageSlice(tiff, self.slice_dir_path).get_slices()

            # Collect the paths of the slice images
            tif_images = FilesScanner(slice_save_path, ['.jpg']).get_files()
            t1 = datetime.datetime.now()
            print('TIFF SLICE COST: %s' % (t1 - t0))

            # CHECK IF ALREADY PROCESSED
            seg_csv = os.path.join(self.meta_files_path, tiff_basename + "_seg.csv")

            # Helper that writes the cell segmentation result to file
            xcep_pre = XceptionPreprocess(tiff)

            if not os.path.exists(seg_csv):
                #################################### YOLO stage #####################################################
                tasks = []

                # Create the process pool (one worker per GPU_NUM)
                executor = ProcessPoolExecutor(max_workers=GPU_NUM)

                if len(tif_images) < cfg.darknet.min_job_length:
                    # Too few slices to be worth splitting: submit a single task
                    tasks.append(executor.submit(yolo_predict, '0', tif_images))
                else:
                    # Split the work into chunks of n images
                    # NOTE(review): n is rounded to nearest, so this can yield
                    # GPU_NUM + 1 chunks (e.g. 9 images with GPU_NUM = 4 gives
                    # n = 2 and 5 chunks). If the first argument of
                    # yolo_predict is a GPU id, the last one would be out of
                    # range - confirm.
                    n = int((len(tif_images) / float(GPU_NUM)) + 0.5)
                    patches = [tif_images[i: i + n] for i in range(0, len(tif_images), n)]

                    for gpu_index, patch in enumerate(patches):
                        tasks.append(executor.submit(yolo_predict, str(gpu_index), patch))

                # Merge the per-task result dicts into one
                seg_results = {}
                for future in as_completed(tasks):
                    result = future.result()
                    seg_results.update(result)

                # Shut the process pool down
                executor.shutdown(wait=True)

                # WRITE DATA TO CSV
                xcep_pre.write_csv(seg_results, seg_csv)

            t2 = datetime.datetime.now()
            print("DARKNET COST %s" % (t2 - t1))

            # XCEPTION preprocess
            cell_lst, cell_index = xcep_pre.gen_np_array_csv(seg_csv=seg_csv)

            ##################################### XCEPTION stage #################################################
            tasks = []
            # Create the process pool (one worker per GPU_NUM)
            executor = ProcessPoolExecutor(max_workers=GPU_NUM)

            if len(cell_lst) < cfg.xception.min_job_length:
                tasks.append(executor.submit(xception_predict, '0', np.asarray(cell_lst)))
            else:
                # Split the work into chunks of n cells (same rounding as in
                # the YOLO stage, so the same NOTE(review) applies)
                n = int((len(cell_lst) / float(GPU_NUM)) + 0.5)
                cell_patches = [cell_lst[i: i + n] for i in range(0, len(cell_lst), n)]

                for gpu_index, patch in enumerate(cell_patches):
                    tasks.append(executor.submit(xception_predict, str(gpu_index), np.asarray(patch)))

            # Results arrive in completion order; file them under the index
            # returned with each one. This rebinds the loop variable `index`,
            # which is not read again in this iteration.
            predictions_ = {}
            for future in as_completed(tasks):
                index, result = future.result()
                predictions_[index] = result

            # Re-assemble in chunk order; presumably xception_predict returns
            # the index string it was given ('0', '1', ...) - verify
            predictions = []
            for i in range(len(predictions_)):
                predictions.extend(predictions_[str(i)])

            # Shut the process pool down
            executor.shutdown(wait=True)

            t3 = datetime.datetime.now()
            print("XCEPTION COST %s" % (t3 - t2))

            clas = XceptionPostprocess()
            clas_dict = clas.convert_all(predictions=predictions, cell_index=cell_index)
            clas_csv = os.path.join(self.meta_files_path, tiff_basename + '_clas.csv')
            clas.write_csv(clas_dict, clas_csv)

            ############################### Generate review images ######################################################
            # GET VIEW CELL IMAGES
            clas.cut_cells_p_marked(tiff, clas_dict, self.cells_path, factor=0.2, N=1)
            t4 = datetime.datetime.now()
            print("GET VIEW IMAGES COST %s" % (t4 - t3))

            print("TIFF %s TOTAL COST %s ..." % (tiff_basename, t4 - t0))
        # 中间文件存放目录
        meta_files_path = os.path.join(resource_save_path, 'test', 'META')

        # 识别出的细胞存储路径
        cells_save_path = os.path.join(resource_save_path, 'test', 'CELLS')
    else:
        # 切图文件存储路径
        slice_dir_path = os.path.join(resource_save_path, 'SLICE')

        # 中间文件存放目录
        meta_files_path = os.path.join(resource_save_path, 'META')

        # 识别出的细胞存储路径
        cells_save_path = os.path.join(resource_save_path, 'CELLS')

    tiff_lst = FilesScanner(tiff_dir_path, ['.kfb', '.tif']).get_files()

    # 执行 TIFF 文件完整性校验
    for tiff in tiff_lst:
        try:
            try:
                slide = openslide.OpenSlide(tiff)
            except:
                slide = TSlide(tiff)
        except Exception as e:
            raise Exception("%s %s" % (tiff, str(e)))

    for item in [slice_dir_path, meta_files_path, cells_save_path]:
        if not os.path.exists(item):
            os.makedirs(item)
Example #24
0
# coding: utf-8
import os

import openslide

from tslide import TSlide
from utils import FilesScanner

# Root directory scanned for slide files
tiff_resource_path = ''
tiffs = FilesScanner(tiff_resource_path, ['.kfb', '.tif']).get_files()

# Group the slide paths by file name. Keying on the full path (as before)
# made every list a singleton, so duplicates were never reported.
files = {}
for item in tiffs:
    basename = os.path.basename(item)
    files.setdefault(basename, []).append(item)

# Print every group of paths that share one file name
for key, lst in files.items():
    if len(lst) > 1:
        print(lst)

# for tiff in tiffs:
#     try:
#         try:
#             slide = openslide.OpenSlide(tiff)
#         except:
#             slide = TSlide(tiff)
#     except:
#         print("TIFF OPEN FAILED => \n%s" % tiff)
Example #25
0
def collect_cells_by_accuracy(path, accuracy, output):
    cell_images = FilesScanner(path, ['.jpg'])
            patch.save(os.path.join(save_path, image_name))
            #patch = cv2.cvtColor(np.asarray(patch), cv2.COLOR_RGBA2BGR)
            #cv2.imwrite(os.path.join(save_path, image_name), patch, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
        except Exception as e:
            print(e)
            print(x_, y_, w_, h_)
            print(slide.dimensions)
            continue

    return None


if __name__ == '__main__':
    # xmls_path = TRAIN_DATA_SAVE_PATH
    # 获取 xml 文件路径列表
    xmls = FilesScanner(CHECKED_CELL_XML_SAVE_PATH, ['.xml']).get_files()
    # xmls = FilesScanner(SELECTED_CELL_XML_SAVE_PATH, ['.xml']).get_files()

    size = len(xmls)

    executor = ProcessPoolExecutor(max_workers=10)
    tasks = []

    tif_path = '/home/cnn/Development/DATA/TRAIN_DATA/TIFFS'
    os.makedirs(METADATA_FILE_PATH, exist_ok=True)
    tif_images_collections_path = os.path.join(METADATA_FILE_PATH,
                                               'TIFF_IMAGES_PATH_DICT.txt')
    tiff_dict = generate_name_path_dict(tif_path, ['.tif', '.kfb'],
                                        tif_images_collections_path)
    # tiff_dict = generate_name_path_dict('', ['.tif', '.kfb'])