Example #1
def get_or_train_cav(concepts,
                     bottleneck,
                     acts,
                     cav_dir=None,
                     cav_hparams=None,
                     overwrite=False):
    """Gets, creating and training if necessary, the specified CAV.

  Assumes the activations already exists.

  Args:
    concepts: set of concepts used for CAV
            Note: if there are two concepts, provide the positive concept
                  first, then negative concept (e.g., ['striped', 'random500_1']
    bottleneck: the bottleneck used for CAV
    acts: dictionary contains activations of concepts in each bottlenecks
          e.g., acts[concept][bottleneck]
    cav_dir: a directory to store the results.
    cav_hparams: a parameter used to learn CAV
    overwrite: if set to True overwrite any saved CAV files.

  Returns:
    returns a CAV instance
  """

    if cav_hparams is None:
        cav_hparams = CAV.default_hparams()

    cav_path = None
    if cav_dir is not None:
        utils.make_dir_if_not_exists(cav_dir)
        cav_path = os.path.join(
            cav_dir,
            CAV.cav_key(concepts, bottleneck, cav_hparams.model_type,
                        cav_hparams.alpha).replace('/', '.') + '.pkl')

        if not overwrite and os.path.exists(cav_path):
            try:
                tf.logging.debug('CAV already exists: {}'.format(cav_path))
                cav_instance = CAV.load_cav(cav_path)
                cav_instance.cavs /= np.linalg.norm(
                    cav_instance.cavs)  # normalize
                tf.logging.info('Loaded CAV accuracies: {}'.format(
                    cav_instance.accuracies))
                return cav_instance
            except Exception:
                tf.logging.info('Failed to load the CAV. Now calculating...')

    tf.logging.debug('Training CAV {} - {} alpha {}'.format(
        concepts, bottleneck, cav_hparams.alpha))
    cav_instance = CAV(concepts, bottleneck, cav_hparams, cav_path)
    cav_instance.train({c: acts[c] for c in concepts})
    cav_instance.cavs /= np.linalg.norm(cav_instance.cavs)  # normalize
    tf.logging.info('CAV accuracies: {}'.format(cav_instance.accuracies))
    return cav_instance
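The snippet below is a minimal usage sketch for get_or_train_cav, assuming the standard tcav layout of the activations dictionary (concept -> bottleneck -> array); the concept names, bottleneck name, array shapes, and cache directory are hypothetical placeholders.

import numpy as np

# Hypothetical activations: acts[concept][bottleneck] -> array of activations.
# Names and shapes are placeholders; use your own model's bottleneck outputs.
acts = {
    'striped': {'mixed4c': np.random.randn(40, 7, 7, 512)},
    'random500_1': {'mixed4c': np.random.randn(40, 7, 7, 512)},
}

cav = get_or_train_cav(['striped', 'random500_1'],  # positive concept first
                       'mixed4c',
                       acts,
                       cav_dir='/tmp/cavs',  # or None to skip on-disk caching
                       cav_hparams=None,     # falls back to CAV.default_hparams()
                       overwrite=False)
print(cav.accuracies)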
Example #2
def make_directories(self):
    self.working_dir = os.path.join(self.project_dir,
                                    "tmp/{}".format(self.model_to_run))
    self.activation_dir = os.path.join(self.working_dir, 'activations/')
    self.cav_dir = os.path.join(self.working_dir, 'cavs/')
    self.source_dir = os.path.join(self.project_dir, "source_dir")
    print(self.working_dir)
    print(self.activation_dir)
    print(self.cav_dir)
    print(self.source_dir)
    utils.make_dir_if_not_exists(self.activation_dir)
    utils.make_dir_if_not_exists(self.working_dir)
    utils.make_dir_if_not_exists(self.cav_dir)
Example #3
    # Name of the parent directory where results are stored
    project_name = 'tcav_class_test'
    working_dir = 'D:\\tcav_working_dir'

    # Where activations are stored
    activation_dir = working_dir + '/activations/'

    # Where CAVs are stored
    cav_dir = working_dir + '/cavs/'

    # Where the images live
    source_dir = 'D:\\tcav_data_file'
    # bottlenecks = ['layer1', 'layer2', 'layer3']  # @param # Reduced to save time
    bottlenecks = ['layer2']

    utils.make_dir_if_not_exists(working_dir)
    utils.make_dir_if_not_exists(activation_dir)
    utils.make_dir_if_not_exists(cav_dir)

    LABEL_PATH = './imagenet_comp_graph_label_strings.txt'

    # This is a regularizer penalty parameter for the linear classifier used to learn CAVs
    alphas = [0.1]

    target = 'cat'
    # concepts = ["dotted", "striped", "zigzagged"] # Reduced to save time
    concepts = ['striped']
    random_counterpart = 'random500_500'
    my_model = model.SmallResNet50Wrapper(LABEL_PATH)
    act_generator = activation_generator.ImageActivationGenerator(
        my_model, source_dir, activation_dir, max_examples=40)
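To show where this setup leads, here is a hedged continuation sketch using the TCAV class the way the later examples do; the TensorFlow session `sess` and the small num_random_exp value are assumptions and not part of the original snippet.

# Hedged continuation sketch: `sess` (the wrapper's TensorFlow session) and
# num_random_exp are assumptions; the other names come from the setup above.
mytcav = TCAV(sess,
              target,
              concepts,
              bottlenecks,
              act_generator,
              alphas,
              random_counterpart,
              cav_dir=cav_dir,
              num_random_exp=3)
results = mytcav.run()
print('number of results:', len(results))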
Example #4
def run_tcav(model, dataset, previous_tcav_dict=None):

    dataset_name = dataset.dataset_name
    id_to_labels = dataset.id_to_label

    model_to_run = 'inception_v3'
    tcav_dir = "models/tensorflow_inception_v3_tcav_temps"

    # where activations are stored (only if your act_gen_wrapper does so)
    activation_dir = os.path.join(tcav_dir, 'activations/')
    # where CAVs are stored.
    # You can pass None if you don't wish to store any.
    cav_dir = os.path.join(tcav_dir, 'cavs/')

    concept_directory = "datasets/tcav_concepts"
    target_directory = "datasets/image_ILSVRC2012_validation"
    bottlenecks = ['Mixed_5d', 'Mixed_7c']  # @param

    utils.make_dir_if_not_exists(activation_dir)
    utils.make_dir_if_not_exists(cav_dir)

    # this is a regularizer penalty parameter for the linear classifier used to learn CAVs.
    alphas = [0.1]
    # folder where the random images are stored
    random_counterpart = 'random_images'
    #targets = random.sample(id_to_labels.keys(), 50)
    targets = [286, 370, 757, 595, 147, 108, 478, 517, 334, 173, 948, 727, 23]
    if -1 in targets:
        targets.remove(-1)
    print(targets)

    concepts = [
        dI for dI in os.listdir(concept_directory)
        if os.path.isdir(os.path.join(concept_directory, dI))
        and "random" not in dI and "." not in dI
    ]

    the_model = TCAVInceptionWrapperSlim(model.session, model, id_to_labels)

    act_generator = act_gen.ImageActivationGenerator(
        the_model,
        concept_directory,
        activation_dir,
        max_examples=100,
        target_dir=target_directory,
        label_to_element_dict=dataset.label_to_elements)

    tf.logging.set_verbosity(0)

    if previous_tcav_dict is None:
        tcav_dict = {}
    else:
        tcav_dict = previous_tcav_dict

    for target in targets:
        mytcav = TCAV(model.session,
                      target,
                      concepts,
                      bottlenecks,
                      act_generator,
                      alphas,
                      random_counterpart,
                      cav_dir=cav_dir,
                      num_random_exp=19,
                      use_numeric_class_label=True)

        results = mytcav.run(run_parallel=True, num_workers=5)

        tcav_dict = utils.print_results(results,
                                        class_id=target,
                                        result_dict=tcav_dict)

        tcav_file_path = os.path.join(
            "models",
            dataset_name + model.model_name + '-tcavscores-2' + '.pkl')
        with open(tcav_file_path, 'wb') as f:
            pickle.dump(tcav_dict, f, pickle.HIGHEST_PROTOCOL)

    return tcav_dict
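A hedged usage sketch for this function: `my_model` and `my_dataset` are placeholders for whatever your model and dataset loaders return (objects exposing .session, .model_name, .dataset_name, and so on, as used above), and the second call shows how previous_tcav_dict lets a later run keep accumulating scores.

# Hypothetical usage; my_model and my_dataset are placeholders.
tcav_dict = run_tcav(my_model, my_dataset)
# Resume later and keep accumulating scores in the same dictionary:
tcav_dict = run_tcav(my_model, my_dataset, previous_tcav_dict=tcav_dict)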
Example #5
def run_tcav():
    model = get_model_list("../../models/")[0]
    dataset = get_dataset_list("../../datasets")[0]

    dataset_name = dataset.dataset_name
    id_to_labels = dataset.id_to_label

    model_to_run = 'inception_v3'
    tcav_dir = "../../models/tensorflow_inception_v3"

    # where activations are stored (only if your act_gen_wrapper does so)
    activation_dir = os.path.join(tcav_dir, 'activations/')
    # where CAVs are stored.
    # You can pass None if you don't wish to store any.
    cav_dir = os.path.join(tcav_dir, 'cavs/')

    concept_directory = "../../datasets/tcav_concepts"
    target_directory = "../../datasets/targets"
    bottlenecks = ['Mixed_5d']  # @param

    utils.make_dir_if_not_exists(activation_dir)
    utils.make_dir_if_not_exists(cav_dir)

    # this is a regularizer penalty parameter for the linear classifier used to learn CAVs.
    alphas = [0.1]
    # folder where the random images are stored
    random_counterpart = 'random_images'
    targets = ['zebra']
    concepts = ["dotted", "striped", "zigzagged", "irregular pattern", "gradient", "single color"]

    # crawl images for the concepts and the target classes
    for concept in concepts:
        if not os.path.isdir(os.path.join(concept_directory, concept)):
            image_crawler.crawl_images(concept_directory, concept, N=50)
        # if not os.path.isdir(os.path.join(concept_directory, random_counterpart)):
        #    image_crawler.crawl_images(concept_directory, 'image', N=500)
    for target in targets:
        if not os.path.isdir(os.path.join(target_directory, target)):
            image_crawler.crawl_images(target_directory, target, N=50)

    the_model = cm.InceptionV3Wrapper_custom(model.session,
                                             model,
                                             id_to_labels)

    act_generator = act_gen.ImageActivationGenerator(the_model, concept_directory, activation_dir, max_examples=100)

    tf.logging.set_verbosity(0)

    tcav_dict = {}

    for target in targets:
        mytcav = TCAV(model.session,
                      target,
                      concepts,
                      bottlenecks,
                      act_generator,
                      alphas,
                      random_counterpart,
                      cav_dir=cav_dir,
                      num_random_exp=5)

        results = mytcav.run()

        # We have to subtract 1 from the target class id so that it matches our ground-truth
        # labels: internally the network outputs are shifted by one, since 0 represents the
        # background class instead of -1.
        summary = utils.print_results(
            results, class_id=the_model.label_to_id(target) - 1, result_dict=tcav_dict)

    tcav_file_path = os.path.join(model.model_path, dataset_name + model.model_name + '-tcavscores' + '.pkl')
    with open(tcav_file_path, 'wb') as f:
        pickle.dump(tcav_dict, f, pickle.HIGHEST_PROTOCOL)
Example #6
def create_tcav_dirs(dataset=None,
                     model=None,
                     concept_dir="datasets/tcav_concepts",
                     tcav_dir=None,
                     nr_of_random_experiments=12,
                     concepts=[],
                     nr_imgs_per_concept=50,
                     nr_random_images=500):

    # if no custom tcav dir is given, create one in the model directory
    if tcav_dir is None:
        tcav_dir = os.path.join(model.model_path, "tcav/")
    # where activations are stored (only if your act_gen_wrapper does so)
    activation_dir = os.path.join(tcav_dir, 'activations/')
    # where CAVs are stored.
    # You can pass None if you don't wish to store any.
    cav_dir = os.path.join(tcav_dir, 'cavs/')

    utils.make_dir_if_not_exists(activation_dir)
    utils.make_dir_if_not_exists(cav_dir)

    # Folders that the random images are stored in. These concept names are used as search
    # terms for the Google image crawler, so choosing general/random terms leads to a wide
    # variety of images without a preference for any one concept.
    random_counterparts = ['random_images', 'photo', 'image']

    crawl_and_process_concepts(concept_dir, concepts, random_counterparts,
                               nr_imgs_per_concept, nr_random_images)
    '''
        Create directories for the random experiments. For each TCAV random experiment, we
        need one folder of random images. These directories are filled with 1/3 random
        images from the random_counterpart folders and 2/3 images sampled from all concept
        directories. This leads to random counterparts that do not favour one specific
        concept.
    '''
    # name schema of the random concept directories, 'random500_' is the default
    random_concept = "random500_"

    other_concept_file_list = []
    random_file_list = []

    for root, _, files in os.walk(concept_dir):
        if root == concept_dir:
            continue
        if os.path.basename(root) in random_counterparts:
            random_file_list += [os.path.join(root, f) for f in files]
        elif "." in os.path.basename(root):
            continue
        else:
            other_concept_file_list += [os.path.join(root, f) for f in files]

    for index in range(0, nr_of_random_experiments):
        random_out_dir = os.path.join(concept_dir, random_concept + str(index))
        os.makedirs(random_out_dir)
        file_list = []
        file_list += random.sample(random_file_list, 33)
        file_list += random.sample(other_concept_file_list, 67)
        for infile in file_list:
            if os.path.basename(infile) != '.DS_Store':
                outfile = "-".join(infile.split("/")[2:])
                im = Image.open(infile)
                if im.mode not in ('RGBA', 'LA', 'P'):
                    im.save(os.path.join(random_out_dir, outfile), "JPEG")
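A hedged call sketch for create_tcav_dirs: `my_model` is a placeholder object with a model_path attribute, and the concept list is purely illustrative.

# Hypothetical usage; my_model is a placeholder with a model_path attribute.
create_tcav_dirs(model=my_model,
                 concept_dir="datasets/tcav_concepts",
                 concepts=["striped", "dotted", "zigzagged"],
                 nr_of_random_experiments=12,
                 nr_imgs_per_concept=50,
                 nr_random_images=500)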
Example #7
def save_results(results, result_dir, name, random_counterpart=None, random_concepts=None, num_random_exp=100,
    min_p_val=0.05):
  """Helper function to organize results.
  When run in a notebook, outputs a matplotlib bar plot of the
  TCAV scores for all bottlenecks for each concept, replacing the
  bars with asterisks when the TCAV score is not statistically significant.
  If you ran TCAV with a random_counterpart, supply it here, otherwise supply random_concepts.
  If you get unexpected output, make sure you are using the correct keywords.

  Args:
    results: dictionary of results from TCAV runs.
    random_counterpart: name of the random_counterpart used, if it was used. 
    random_concepts: list of random experiments that were run. 
    num_random_exp: number of random experiments that were run.
    min_p_val: minimum p value for statistical significance
  """

  utils.make_dir_if_not_exists(result_dir)

  result_json = {
    'concepts': []
  }

  # helper function: returns True if this is a random concept
  def is_random_concept(concept):
    if random_counterpart:
      return random_counterpart == concept
    
    elif random_concepts:
      return concept in random_concepts

    else:
      return 'random500_' in concept

  # print the class; it is the same for all results
  print("Class =", results[0]['target_class'])
  result_json['class'] = results[0]['target_class']

  # prepare data
  # dict with keys of concepts containing dict with bottlenecks
  result_summary = {}
    
  # random
  random_i_ups = {}
    
  for result in results:
    if result['cav_concept'] not in result_summary:
      result_summary[result['cav_concept']] = {}
    
    if result['bottleneck'] not in result_summary[result['cav_concept']]:
      result_summary[result['cav_concept']][result['bottleneck']] = []
    
    result_summary[result['cav_concept']][result['bottleneck']].append(result)

    # store random
    if is_random_concept(result['cav_concept']):
      if result['bottleneck'] not in random_i_ups:
        random_i_ups[result['bottleneck']] = []
        
      random_i_ups[result['bottleneck']].append(result['i_up'])
    
  # to plot, must massage data again 
  plot_data = {}
  plot_concepts = []
    
  # print concepts and classes with indentation
  for concept in result_summary:
        
    # if not random
    if not is_random_concept(concept):
      print(" ", "Concept =", concept)
      concept_json = {
        'name': concept,
        'bottlenecks': []
      }
      plot_concepts.append(concept)

      for bottleneck in result_summary[concept]:
        i_ups = [item['i_up'] for item in result_summary[concept][bottleneck]]
        
        # Calculate statistical significance
        _, p_val = ttest_ind(random_i_ups[bottleneck], i_ups)
                  
        if bottleneck not in plot_data:
          plot_data[bottleneck] = {'bn_vals': [], 'bn_stds': [], 'significant': []}

        if p_val > min_p_val:
          # statistically insignificant
          plot_data[bottleneck]['bn_vals'].append(0.01)
          plot_data[bottleneck]['bn_stds'].append(0)
          plot_data[bottleneck]['significant'].append(False)
            
        else:
          plot_data[bottleneck]['bn_vals'].append(np.mean(i_ups))
          plot_data[bottleneck]['bn_stds'].append(np.std(i_ups))
          plot_data[bottleneck]['significant'].append(True)

        print(3 * " ", "Bottleneck =", ("%s. TCAV Score = %.2f (+- %.2f), "
            "random was %.2f (+- %.2f). p-val = %.3f (%s)") % (
            bottleneck, np.mean(i_ups), np.std(i_ups),
            np.mean(random_i_ups[bottleneck]),
            np.std(random_i_ups[bottleneck]), p_val,
            "not significant" if p_val > min_p_val else "significant"))

        concept_json['bottlenecks'].append({
          'name': bottleneck,
          'tcav_score_mean': np.mean(i_ups),
          'tcav_score_std': np.std(i_ups),
          'random_score_mean': np.mean(random_i_ups[bottleneck]),
          'random_score_std': np.std(random_i_ups[bottleneck]),
          'p_val': p_val,
          'significant': not p_val > min_p_val
        })
      result_json['concepts'].append(concept_json)

        
  # subtract the random experiments to get the number of real concepts
  if random_counterpart:
    num_concepts = len(result_summary) - 1
  elif random_concepts:
    num_concepts = len(result_summary) - len(random_concepts)
  else: 
    num_concepts = len(result_summary) - num_random_exp
    
  num_bottlenecks = len(plot_data)
  bar_width = 0.35
    
  # create location for each bar. scale by an appropriate factor to ensure 
  # the final plot doesn't have any parts overlapping
  index = np.arange(num_concepts) * bar_width * (num_bottlenecks + 1)

  # matplotlib
  fig, ax = plt.subplots()
    
  # draw all bottlenecks individually
  for i, [bn, vals] in enumerate(plot_data.items()):
    bar = ax.bar(index + i * bar_width, vals['bn_vals'],
        bar_width, yerr=vals['bn_stds'], label=bn)
    
    # draw stars to mark bars that are statistically insignificant, to
    # show them as different from the others
    for j, significant in enumerate(vals['significant']):
      if not significant:
        ax.text(index[j] + i * bar_width - 0.1, 0.01, "*",
            fontdict = {'weight': 'bold', 'size': 16,
            'color': bar.patches[0].get_facecolor()})
  print(plot_data)
  result_json['plot_data'] = plot_data
  with open(os.path.join(result_dir, 'result.json'), 'w') as outfile:
    json.dump(result_json, outfile, indent=4)
  # set properties
  ax.set_title('TCAV Scores for each concept and bottleneck')
  ax.set_ylabel('TCAV Score')
  ax.set_xticks(index + num_bottlenecks * bar_width / 2)
  ax.set_xticklabels(plot_concepts)
  ax.legend()
  fig.tight_layout()
  plt.savefig(os.path.join(result_dir, '{}.png'.format(name)))
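A hedged usage sketch for save_results: `results` would come from a TCAV run like the ones above, and the result directory, plot name, and random_concepts list are illustrative and must match the random experiment folders that were actually used.

# Hypothetical usage; `results` comes from a TCAV run as in the earlier examples.
save_results(results,
             result_dir='results/zebra',
             name='zebra_tcav_scores',
             random_concepts=['random500_{}'.format(i) for i in range(19)],
             num_random_exp=19)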