import hashlib
import os
import re

import cv2  # assumed available for decoding images
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.util import compat

MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # ~134M, the cap used by TF's retrain script


def create_image_lists():
    image_dir = '/home/rick/derma/dataset'
    testing_percentage = 20

    result = {}
    counter_for_result_label = 0

    # Collect the sub-directories of image_dir; each one holds one class.
    sub_dirs = [x[0] for x in gfile.Walk(image_dir)]

    # The root directory comes first, so skip it.
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue

        dir_name = os.path.basename(sub_dir)

        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
        file_list = []

        tf.logging.info("Looking for images in '" + dir_name + "'")
        for extension in extensions:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            file_list.extend(gfile.Glob(file_glob))  # all files matching this extension

        # Normalize the directory name to a label (computed but unused in this variant).
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())

        # Divide the files into training and testing sets.
        training_images = []
        testing_images = []
        for file_name in file_list:
            # Bare image name (e.g. 5547758_ed54_n); computed but unused here.
            base_name = os.path.basename(file_name)

            hash_name = re.sub(r'_nohash_.*$', '', file_name)

            hash_name_hashed = hashlib.sha1(
                compat.as_bytes(hash_name)).hexdigest()
            percentage_hash = ((int(hash_name_hashed, 16) %
                                (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                               (100.0 / MAX_NUM_IMAGES_PER_CLASS))
            if percentage_hash < testing_percentage:
                # Unlike the stock retrain script, this variant stores decoded
                # image data (via cv2.imread) rather than file names.
                testing_images.append(cv2.imread(file_name))
            else:
                training_images.append(cv2.imread(file_name))

        result[counter_for_result_label] = {
            'training_label': [counter_for_result_label] * len(training_images),
            'testing_label': [counter_for_result_label] * len(testing_images),
            'training': training_images,
            'testing': testing_images,
        }

        counter_for_result_label += 1

    return result
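Since this variant keys the result by a numeric label and stores decoded pixel arrays, a minimal consumption sketch follows; the stacking assumes every image shares one shape, which the snippet itself does not enforce:

import numpy as np

# Hypothetical consumption of the result dict built above.
lists = create_image_lists()
train_x = np.array([img for label in lists for img in lists[label]['training']])
train_y = np.array([y for label in lists for y in lists[label]['training_label']])
print(train_x.shape, train_y.shape)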
Example #2
def create_image_lists(image_dir, testing_percentage, validation_percentage):
  """file system으로부터 training 이미지들의 list를 만든다.
  이미지 디렉토리로부터 sub folder들을 분석하고, 그들을 training, testing, validation sets으로 나눈다.
  그리고 각각의 label을 위한 이미지 list와 그들의 경로(path)를 나타내는 자료구조(data structure)를 반환한다.
  인수들(Args):
    image_dir: 이미지들의 subfolder들을 포함한 folder의 String path.
    testing_percentage: 전체 이미지중 테스트를 위해 사용되는 비율을 나타내는 Integer.
    validation_percentage: 전체 이미지중 validation을 위해 사용되는 비율을 나타내는 Integer.
  반환값들(Returns):
    각각의 label subfolder를 위한 enrtry를 포함한 dictionary A dictionary 
    (각각의 label에서 이미지드릉ㄴ training, testing, validation sets으로 나뉘어져 있다.)
  """
  if not gfile.Exists(image_dir):
    print("Image directory '" + image_dir + "' not found.")
    return None
  result = {}
  sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
  # The root directory comes first, so skip it.
  is_root_dir = True
  for sub_dir in sub_dirs:
    if is_root_dir:
      is_root_dir = False
      continue
    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
    file_list = []
    dir_name = os.path.basename(sub_dir)
    if dir_name == image_dir:
      continue
    print("Looking for images in '" + dir_name + "'")
    for extension in extensions:
      file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
      file_list.extend(gfile.Glob(file_glob))
    if not file_list:
      print('No files found')
      continue
    if len(file_list) < 20:
      print('WARNING: Folder has less than 20 images, which may cause issues.')
    elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
      print('WARNING: Folder {} has more than {} images. Some images will '
            'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
    label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
    training_images = []
    testing_images = []
    validation_images = []
    for file_name in file_list:
      base_name = os.path.basename(file_name)
      # We want to ignore anything after '_nohash_' in the file name when
      # deciding which set to put an image in, so that the data set creator
      # has a way of grouping photos that are close variations of each other.
      # For example, this is used in the plant disease data set to group
      # multiple pictures of the same leaf.
      hash_name = re.sub(r'_nohash_.*$', '', file_name)
      # This looks a bit magical, but we need to decide whether this file
      # should go into the training, testing, or validation sets, and we want
      # to keep existing files in the same set even if more files are
      # subsequently added. To do that, we need a stable way of deciding based
      # on just the file name itself, so we hash it and use the result to
      # generate a probability value that assigns the file to a set.
      hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
      percentage_hash = ((int(hash_name_hashed, 16) %
                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                         (100.0 / MAX_NUM_IMAGES_PER_CLASS))
      if percentage_hash < validation_percentage:
        validation_images.append(base_name)
      elif percentage_hash < (testing_percentage + validation_percentage):
        testing_images.append(base_name)
      else:
        training_images.append(base_name)
    result[label_name] = {
        'dir': dir_name,
        'training': training_images,
        'testing': testing_images,
        'validation': validation_images,
    }
  return result
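For orientation, a minimal sketch of how the returned structure is typically consumed; the path and percentages here are placeholders, not from the source:

image_lists = create_image_lists('/tmp/flower_photos', 10, 10)
for label_name, label_data in image_lists.items():
    print(label_name, len(label_data['training']),
          len(label_data['testing']), len(label_data['validation']))
    # Full path of one training image for this label would be:
    # os.path.join(image_dir, label_data['dir'], label_data['training'][0])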
Example #3
def create_image_lists(image_dir, testing_percentage, validation_percentage,
                       max_num_images_per_class):
    """Builds a list of training images from the file system.

    Analyzes the sub folders in the image directory, splits them into stable
    training, testing, and validation sets, and returns a data structure
    describing the lists of images for each label and their paths.

    Args:
      image_dir: String path to a folder containing subfolders of images.
      testing_percentage: Integer percentage of the images to reserve for tests.
      validation_percentage: Integer percentage of images reserved for validation.
      max_num_images_per_class: Integer cap on images per class; used as the
        modulus for the stable hash-based split.

    Returns:
      A dictionary containing an entry for each label subfolder, with images split
      into training, testing, and validation sets within each label.
    """
    if not gfile.Exists(image_dir):
        tf.logging.error("Image directory '" + image_dir + "' not found.")
        return None
    result = {}
    sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
    # The root directory comes first, so skip it.
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue
        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
        file_list = []
        dir_name = os.path.basename(sub_dir)
        if dir_name == image_dir:
            continue
        tf.logging.info("Looking for images in '" + dir_name + "'")
        for extension in extensions:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            file_list.extend(gfile.Glob(file_glob))
        if not file_list:
            tf.logging.warning('No files found')
            continue
        if len(file_list) < 20:
            tf.logging.warning(
                'WARNING: Folder has less than 20 images, which may cause issues.'
            )
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)
            # We want to ignore anything after '_nohash_' in the file name
            # when deciding which set to put an image in, so that the data set
            # creator has a way of grouping photos that are close variations
            # of each other. For example this is used in the plant disease
            # data set to group multiple pictures of the same leaf.
            hash_name = re.sub(r'_nohash_.*$', '', file_name)
            # This looks a bit magical, but we need to decide whether this file should
            # go into the training, testing, or validation sets, and we want to keep
            # existing files in the same set even if more files are subsequently
            # added.
            # To do that, we need a stable way of deciding based on just the file name
            # itself, so we do a hash of that and then use that to generate a
            # probability value that we use to assign it.
            hash_name_hashed = hashlib.sha1(
                compat.as_bytes(hash_name)).hexdigest()
            percentage_hash = ((int(hash_name_hashed, 16) %
                                (max_num_images_per_class + 1)) *
                               (100.0 / max_num_images_per_class))
            if percentage_hash < validation_percentage:
                validation_images.append(base_name)
            elif percentage_hash < (testing_percentage +
                                    validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)
        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }
    return result
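The hash-based split can be isolated into a small pure function. The sketch below (names are ours, not from the source) shows that the assignment is deterministic per file name and that '_nohash_' variants land in the same set:

import hashlib
import re

MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # value used by the TensorFlow retrain script

def which_set(file_name, testing_percentage, validation_percentage):
    """Deterministically map a file name to 'training', 'testing', or 'validation'."""
    hash_name = re.sub(r'_nohash_.*$', '', file_name)
    hash_name_hashed = hashlib.sha1(hash_name.encode('utf-8')).hexdigest()
    percentage_hash = ((int(hash_name_hashed, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                       (100.0 / MAX_NUM_IMAGES_PER_CLASS))
    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < testing_percentage + validation_percentage:
        return 'testing'
    return 'training'

# Variations of the same photo always land in the same set:
assert which_set('leaf_1_nohash_a.jpg', 10, 10) == which_set('leaf_1_nohash_b.jpg', 10, 10)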
Example #4
import os.path
import re
import sys

import pymysql as mysql  # assumed driver; the snippet only shows `mysql.connect(...)`
from tensorflow.python.platform import gfile

db = mysql.connect(host='ec2-13-124-80-232.ap-northeast-2.compute.amazonaws.com', user='******', password='******', db='forstyle', charset='utf8')

curs = db.cursor()
# Parameterized update; the driver quotes values, so no literal quotes around %s.
sql = """update back_up set product_clothes_label = %s where product_file_name = %s"""

image_dir = sys.argv[1]

if not gfile.Exists(image_dir):
    print("Image directory '" + image_dir + "' not found.")

sub_dirs = [x[0] for x in gfile.Walk(image_dir)]

for sub_dir in sub_dirs:
    dir_name = os.path.basename(sub_dir)
    if dir_name == image_dir:
        continue
    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
    file_list = []

    for extension in extensions:
        file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
        file_list.extend(gfile.Glob(file_glob))

    for image_path in file_list:
        tmp = re.split('image_', image_path)
        image_name = re.split(r'\.', tmp[1])
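The snippet defines `sql` but never executes it. A plausible completion (our assumption; the real label source is not shown) runs the parameterized update once per image and commits after the walk:

        # Inside the inner loop, after image_name is computed:
        label = 'unknown'  # hypothetical stand-in for the real classifier output
        curs.execute(sql, (label, image_name[0]))

# After all folders are processed:
db.commit()
db.close()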
Example #5
  def __init__(self, dump_root, partition_graphs=None, validate=True):
    """DebugDumpDir constructor.

    Args:
      dump_root: Path to the dump root directory.
      partition_graphs: A repeated field of GraphDefs representing the
          partition graphs executed by the TensorFlow runtime.
      validate: Whether the dump files are to be validated against the
          partition graphs.

    Raises:
      IOError: If dump_root does not exist as a directory.
      ValueError: If the dump_root directory contains file path patterns
         that do not conform to the canonical dump file naming pattern.
    """

    if not gfile.IsDirectory(dump_root):
      raise IOError("Dump root directory %s does not exist" % dump_root)

    self._dump_root = dump_root
    self._dump_tensor_data = []
    dump_graph_file_paths = []

    # A map from node name to debug watches.
    # The key is the watched node name.
    # The value is a dictionary.
    #   Of this dictionary, the key is the watched_output_slot.
    #   The value is a set of debug ops watching this output slot.
    self._debug_watches = collections.defaultdict(
        lambda: collections.defaultdict(set))

    for root, _, files in gfile.Walk(self._dump_root):
      for f in files:
        if _is_graph_file(f):
          dump_graph_file_paths.append(os.path.join(self._dump_root, root, f))
          continue

        if f.count("_") < 3:
          raise ValueError(
              "Dump file path does not conform to the naming pattern: %s" % f)

        debug_dump_rel_path = os.path.join(
            os.path.relpath(root, self._dump_root), f)
        datum = DebugTensorDatum(self._dump_root, debug_dump_rel_path)
        self._dump_tensor_data.append(datum)

        # Attempt to load the debug watches from the tensor dump files first,
        # before loading the full set of debug watches from the partition
        # graphs as done further below.
        # This is necessary because sometimes the partition graphs may not be
        # available, e.g., when the run errors out.
        self._debug_watches[datum.node_name][datum.output_slot].add(
            datum.debug_op)

    # Sort the data by ascending timestamp.
    # This sorting order reflects the order in which the TensorFlow
    # executor processed the nodes of the graph. It is one of many
    # possible topological sorts of the nodes. This is useful for
    # displaying tensors in the debugger frontend as well as for the use
    # case in which the user wants to find a "culprit tensor", i.e., the
    # first tensor in the graph that exhibits certain problematic
    # properties, e.g., all zero values, or bad numerical values such as
    # nan and inf.
    self._dump_tensor_data = sorted(
        self._dump_tensor_data, key=lambda x: x.timestamp)

    # Time stamp of the first tensor dump.
    if self._dump_tensor_data:
      self._t0 = self._dump_tensor_data[0].timestamp
    else:
      self._t0 = None

    # Create a map from watch key (tensor name + debug op) to
    # DebugTensorDatum item.
    # Also make a map from watch key to relative timestamp.
    # "relative" means (absolute timestamp - t0).
    self._watch_key_to_datum = {}
    self._watch_key_to_rel_time = {}
    for datum in self._dump_tensor_data:
      if datum.watch_key not in self._watch_key_to_datum:
        self._watch_key_to_datum[datum.watch_key] = [datum]
        self._watch_key_to_rel_time[datum.watch_key] = [
            datum.timestamp - self._t0
        ]
      else:
        self._watch_key_to_datum[datum.watch_key].append(datum)
        self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
                                                            self._t0)

    # Initialize partition graph-related information.
    self._partition_graphs = None
    self._node_inputs = None
    self._node_ctrl_inputs = None
    self._node_recipients = None
    self._node_ctrl_recipients = None
    self._devices = None
    self._node_devices = None
    self._node_op_types = None

    # Check the dump data against partition executor graphs.
    if partition_graphs:
      self._load_partition_graphs(partition_graphs)
    elif dump_graph_file_paths:
      # In case partition graphs are not available from arguments, load them
      # from the dump directory.
      dump_graph_defs = [
          _load_graph_def_from_event_file(dump_file_path)
          for dump_file_path in dump_graph_file_paths
      ]
      self._load_partition_graphs(dump_graph_defs)

    if (self._partition_graphs is not None) and validate:
      self._validate_dump_with_graphs()
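The `_debug_watches` bookkeeping above is a two-level `defaultdict`. A standalone sketch, with made-up node and op names, of how it behaves:

import collections

# Two-level map: node name -> output slot -> set of debug ops watching that slot.
debug_watches = collections.defaultdict(lambda: collections.defaultdict(set))

# Made-up example entries:
debug_watches['MatMul'][0].add('DebugIdentity')
debug_watches['MatMul'][0].add('DebugNumericSummary')

print(sorted(debug_watches['MatMul'][0]))  # ['DebugIdentity', 'DebugNumericSummary']
print(debug_watches['Relu'][1])            # set(), created empty on first access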
Example #6
def create_image_lists(image_dir, testing_percentage, validation_percentage):
  if not gfile.Exists(image_dir):
    tf.logging.error("Image directory '" + image_dir + "' not found.")
    return None

  result = {}

  sub_dirs = [x[0] for x in gfile.Walk(image_dir)]

  # The root directory comes first, so skip it.
  is_root_dir = True
  for sub_dir in sub_dirs:
    if is_root_dir:
      is_root_dir = False
      continue

    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
    file_list = []

    dir_name = os.path.basename(sub_dir)
    if dir_name == image_dir:
      continue

    tf.logging.info("Looking for images in '" + dir_name + "'")
    for extension in extensions:
      file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
      file_list.extend(gfile.Glob(file_glob))

    if not file_list:
      tf.logging.warning('No files found')
      continue

    file_size = len(file_list)

    if file_size < 20:
      tf.logging.warning(
          'WARNING: Folder has less than 20 images, which may cause issues.')

    label_name = dir_name.lower()
    training_images = []
    testing_images = []
    validation_images = []

    # NOTE: this split is random per run (unlike the hash-based variants);
    # seed the RNG first if reproducible splits are needed. Percentages are
    # interpreted on the 0-100 scale used by the companion functions.
    random.shuffle(file_list)
    testing_idx = int(file_size * testing_percentage / 100)
    validation_idx = int(file_size * validation_percentage / 100)

    for index, file_name in enumerate(file_list):
      base_name = os.path.basename(file_name)

      if index < validation_idx:
        validation_images.append(base_name)
      elif index < (testing_idx + validation_idx):
        testing_images.append(base_name)
      else:
        training_images.append(base_name)

    _result = {
      'dir': dir_name,
      'training': training_images,
      'testing': testing_images,
      'validation': validation_images,
    }

    result[label_name] = _result
  return result
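Because this variant shuffles, repeated runs assign files to different sets. One way to make it reproducible (our suggestion, not from the source) is to seed the RNG before each call:

import random

random.seed(1234)  # fix the shuffle order before calling create_image_lists
lists_a = create_image_lists('/tmp/flower_photos', 10, 10)
random.seed(1234)
lists_b = create_image_lists('/tmp/flower_photos', 10, 10)
assert lists_a == lists_b  # identical splits with the same seed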
Example #7
def create_image_lists(image_dir, testing_percentage, validation_percentage):
    if not gfile.Exists(image_dir):
        print("Image directory '" + image_dir + "' not found.")
        return None
    result = {}
    subFolders = [x[0] for x in gfile.Walk(image_dir)]

    is_root_dir = True
    # Loop through the sub folders to collect the images in each one.
    for folder in subFolders:
        # skip the root directory.
        if is_root_dir:
            is_root_dir = False
            continue
        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
        file_list = []
        dir_name = os.path.basename(folder)
        if dir_name == image_dir:
            continue
        print("Looking for images in '" + dir_name + "'")
        # Collect files matching each of the valid extensions.
        for extension in extensions:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            file_list.extend(gfile.Glob(file_glob))
        if not file_list:
            print('No files found')
            continue
        if len(file_list) < 20:
            print('Each folder should contain a minimum of 20 images.')
        elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
            print(
                'Folder {} has more than {} images. Some images might be ignored.'
                .format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())

        # Create the datasets for training, testing and validation.
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)

            # Decide which dataset the current file belongs to; the hash
            # digest of the (nohash-stripped) file name determines placement.

            hash_name = re.sub(r'_nohash_.*$', '', file_name)
            hash_name_hashed = hashlib.sha1(
                compat.as_bytes(hash_name)).hexdigest()
            percentage_hash = ((int(hash_name_hashed, 16) %
                                (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                               (100.0 / MAX_NUM_IMAGES_PER_CLASS))
            if percentage_hash < validation_percentage:
                validation_images.append(base_name)
            elif percentage_hash < (testing_percentage +
                                    validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)
        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }
    return result
Example #8
def create_image_lists(image_dir, testing_percentage, validation_percentage):
    """이미지 디렉토리에서 인풋 데이터를 찾아 데이터로 변환한다"""
    
    ## Print an error if image_dir does not exist
    if not gfile.Exists(image_dir):
        print("Image directory '" + image_dir + "' not found.")
        return None
    
    result = {}
    
    ### Collect the sub-directories (one per label) under image_dir
    sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
    
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue
        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG', 'png', 'PNG']
        
        file_list = []
        dir_name = os.path.basename(sub_dir)
        if dir_name == image_dir:
            continue
            
        print("Looking for images in '" + dir_name + "'")
        for extension in extensions:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            file_list.extend(gfile.Glob(file_glob))
        
        ## Handle missing files or folders with too little data
        if not file_list:
            print('No files found')
            continue
        if len(file_list) < 20:
            print("WARNING: Folder has less than 20 images, which may cause issues.")
        elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
            print("WARNING: Folder {} has more than {} images. Some images will never be selected".format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
        
        ## Split into training / validation / test sets.
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)
            
            hash_name = re.sub(r'_nohash_.*$', '', file_name)
            hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
            percentage_hash = ((int(hash_name_hashed, 16) %
                               (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                              (100.0 / MAX_NUM_IMAGES_PER_CLASS))
            
            if percentage_hash < validation_percentage:
                validation_images.append(base_name)
            elif percentage_hash < (testing_percentage + validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)
        
        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }
        
    return result
Example #9
def create_image_lists(image_dir):
    """Builds a table of DICOM image folders from the file system.

    Walks each subject folder under the image directory, reads the header of
    the first DICOM file found in every folder that contains .dcm files, and
    records the subject ID, series code, series type, study date, image size,
    and slice count.

    Args:
        image_dir: String path to a folder containing one subfolder per
            subject.

    Returns:
        A tuple (folder_table, error_list): a pandas DataFrame with one row
        per detected scan folder, and a list of file paths that failed to
        parse.
    """
    # Initialize the output table
    folder_table = pd.DataFrame(
        columns=['Folder', 'ID', 'Code', 'Type', 'Date', 'Size', 'Slices'])
    # Make sure the directory exists
    if not gfile.Exists(image_dir):
        print("Image directory '" + image_dir + "' not found.")
        return None
    # get the list of subjects
    sub_list = [
        image_dir + '/' + sub_dir for sub_dir in next(os.walk(image_dir))[1]
    ]
    n_subs = len(sub_list)
    print('Number of subjects:{0:d}'.format(n_subs))
    # Iterate over subjects
    count_folder = 0
    count_sub = 0.0
    error_list = []
    for subs in sub_list:
        count_sub += 1
        # Look inside each subject folder
        for x in gfile.Walk(subs):
            # If there are files inside the folder
            if len(x[2]) > 0:
                # Check whether it contains DICOM files
                n_z = len(x[2])
                if x[2][0].endswith('.dcm'):
                    try:
                        dFile = dicom.read_file(x[0] + '/' + x[2][0],
                                                force=True)
                        folder_temp = {}
                        folder_temp['Folder'] = x[0].replace(image_dir, '')
                        folder_temp['ID'] = dFile.PatientID
                        folder_temp['Code'] = dFile.SeriesNumber
                        folder_temp['Type'] = dFile.SeriesDescription
                        folder_temp['Date'] = dFile.StudyDate
                        folder_temp['Size'] = dFile.Rows
                        folder_temp['Slices'] = n_z
                        # attach folder
                        folder_table = folder_table.append(folder_temp,
                                                           ignore_index=True)
                        count_folder += 1
                        # Periodically checkpoint the result
                        if count_folder % 100 == 0:
                            folder_table.to_csv('folder_list_1.csv',
                                                index=False)
                        # Progress line; end='' keeps the \r carriage-return
                        # trick (the original's trailing comma was a Python 2
                        # idiom).
                        print('{0:d} scans detected (~{1:.2f}%)\r'.format(
                            count_folder, 100 * count_sub / n_subs), end='')
                    except Exception:
                        print('error\r', end='')
                        error_list.append(x[0] + '/' + x[2][0])
                        continue
    print('Folder Search Completed')
    return folder_table, error_list
Example #10
def create_image_lists(image_dir, testing_percentage, validation_percentage):
    if not gfile.Exists(image_dir):
        tf.logging.error("Image directory '" + image_dir + "' not found.")
        return None
    result = {}
    sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
    # The root directory comes first, so skip it.
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue
        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
        file_list = []
        dir_name = os.path.basename(sub_dir)
        if dir_name == image_dir:
            continue
        tf.logging.info("Looking for images in '" + dir_name + "'")
        for extension in extensions:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            file_list.extend(gfile.Glob(file_glob))
        if not file_list:
            tf.logging.warning('No files found')
            continue
        if len(file_list) < 20:
            tf.logging.warning(
                'WARNING: Folder has less than 20 images, which may cause issues.'
            )
        elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
            tf.logging.warning(
                'WARNING: Folder {} has more than {} images. Some images will '
                'never be selected.'.format(dir_name,
                                            MAX_NUM_IMAGES_PER_CLASS))
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)

            hash_name = re.sub(r'_nohash_.*$', '', file_name)

            hash_name_hashed = hashlib.sha1(
                compat.as_bytes(hash_name)).hexdigest()
            percentage_hash = ((int(hash_name_hashed, 16) %
                                (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                               (100.0 / MAX_NUM_IMAGES_PER_CLASS))
            if percentage_hash < validation_percentage:
                validation_images.append(base_name)
            elif percentage_hash < (testing_percentage +
                                    validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)
        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }
    return result
Example #11
def create_image_lists_from_prepared_dir(image_dir):
    """Builds a list of training images from the file system.
    and returns a data structure
  describing the lists of images for each label and their paths.

  Args:
    image_dir: String path to a folder containing subfolders of images.

  Returns:
    A dictionary containing an entry for each label subfolder, with images split
    into training, testing, and validation sets within each label.
  """
    if not gfile.Exists(image_dir):
        tf.logging.error("Image directory '" + image_dir + "' not found.")
        return None
    result = {}

    image_dir_train = image_dir + '/train_augmented/'
    image_dir_validate = image_dir + '/validate_not_augmented/'
    image_dir_train_na = image_dir + '/train_not_augmented/' #not augmented

    print(image_dir_train)

    # initialize all labels sets
    print("initializing label sets")
    sub_dirs1 = [x[0] for x in gfile.Walk(image_dir_train)]
    sub_dirs1 = sub_dirs1[1:]
    sub_dirs2 = [x[0] for x in gfile.Walk(image_dir_validate)]
    sub_dirs2 = sub_dirs2[1:]
    sub_dirs3 = [x[0] for x in gfile.Walk(image_dir_train_na)]
    sub_dirs3 = sub_dirs3[1:]

    print("count")
    print(len(sub_dirs1))
    print(len(sub_dirs2))
    print(len(sub_dirs3))

    sub_dirs = sub_dirs1 + sub_dirs2 + sub_dirs3
    print("start")
    for sub_dir in sub_dirs:
        dir_name = os.path.basename(sub_dir)
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())

        result[label_name] = {
            'dir': dir_name,
            'training': [],
            'testing': [],
            'validation': [],
            'training_na': []
        }



    # fill in image_lists with training images
    result = fill_in_specified_set_with_labels(image_dir_train, result, sub_dirs1, 'training')
    # fill in image_lists with validation images
    result = fill_in_specified_set_with_labels(image_dir_validate, result, sub_dirs2, 'validation')
    # fill in image_lists with all training images
    result = fill_in_specified_set_with_labels(image_dir_train_na, result, sub_dirs3, 'training_na')

    return result
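`fill_in_specified_set_with_labels` is not shown in the snippet. Below is a plausible sketch consistent with how the result dictionary is initialized above; the body is our assumption, not the original helper:

import os
import re

from tensorflow.python.platform import gfile

def fill_in_specified_set_with_labels(set_dir, result, sub_dirs, set_name):
    # Hypothetical helper: append every image file under each label folder of
    # set_dir to result[label][set_name]. Not from the original source.
    for sub_dir in sub_dirs:
        dir_name = os.path.basename(sub_dir)
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
        for extension in ['jpg', 'jpeg', 'JPG', 'JPEG']:
            file_glob = os.path.join(set_dir, dir_name, '*.' + extension)
            for file_name in gfile.Glob(file_glob):
                result[label_name][set_name].append(os.path.basename(file_name))
    return result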
Example #12
  def _load_dumps(self, dump_root):
    """Load `DebugTensorDatum` instances from the dump root.

    Populates a list of `DebugTensorDatum` instance and sorts the list by
    ascending timestamp.

    This sorting order reflects the order in which the TensorFlow executor
    processed the nodes of the graph. It is one of many possible topological
    sorts of the nodes. This is useful for displaying tensors in the debugger
    frontend as well as for the use case in which the user wants to find a
    "culprit tensor", i.e., the first tensor in the graph that exhibits certain
    problematic properties, i.e., all zero values, or bad numerical values such
    as nan and inf.

    In addition, creates a map from node name to debug watches. In this map,
    the key is the watched node name; the value is a dictionary whose keys are
    the watched output slots and whose values are the sets of debug ops
    watching each slot.

    This method attempts to load the debug watches from the tensor dump files
    first, before loading the full set of debug watches from the partition
    graphs as done later. This is necessary because sometimes the partition
    graphs may not be available, e.g., when the run errors out.

    Args:
      dump_root: (`str`) Dump root directory.
    """

    self._dump_root = dump_root
    self._dump_tensor_data = []
    self._dump_graph_file_paths = []

    self._debug_watches = collections.defaultdict(
        lambda: collections.defaultdict(set))

    for root, _, files in gfile.Walk(self._dump_root):
      for f in files:
        if f.startswith(METADATA_FILE_PREFIX):
          if _is_core_metadata_file(f):
            self._load_core_metadata(os.path.join(self._dump_root, root, f))

          if _is_graph_file(f):
            self._dump_graph_file_paths.append(
                os.path.join(self._dump_root, root, f))

          if _is_run_fetches_info_file(f):
            self._run_fetches_info = _load_log_message_from_event_file(
                os.path.join(root, f))

          if _is_run_feed_keys_info_file(f):
            self._run_feed_keys_info = _load_log_message_from_event_file(
                os.path.join(root, f))

          continue

        datum = self._dump_file_name_to_datum(root, f)
        self._dump_tensor_data.append(datum)

        self._debug_watches[datum.node_name][datum.output_slot].add(
            datum.debug_op)

    self._dump_tensor_data = sorted(
        self._dump_tensor_data, key=lambda x: x.extended_timestamp)

    if self._dump_tensor_data:
      self._t0 = self._dump_tensor_data[0].timestamp
    else:
      self._t0 = None