def find_limit(subset, data_set, items_of_interest):
    """
    Find the minimum number of similarity ratings to use as a limit.

    Look only at the given subset of the given data_set concerning the given
    set of items_of_interest.
    """
    limit = 1000

    for idx1, item1 in enumerate(items_of_interest):
        for idx2, item2 in enumerate(items_of_interest):
            if idx2 <= idx1:
                continue
            tuple_id = list_to_string([item1, item2])
            if tuple_id in data_set['similarities']:
                similarity_ratings = data_set['similarities'][tuple_id]['values']
                if subset == "between":
                    # remove everything from first study
                    border = data_set['similarities'][tuple_id]['border']
                    similarity_ratings = similarity_ratings[border:]
                elif subset == "within":
                    # remove everything from second study
                    border = data_set['similarities'][tuple_id]['border']
                    similarity_ratings = similarity_ratings[:border]
                if len(similarity_ratings) > 0:
                    # only adapt the limit if there are any ratings left
                    limit = min(limit, len(similarity_ratings))

    return limit
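
# A minimal sketch of what find_limit computes, on made-up data. It assumes
# that list_to_string joins the item IDs with '-' (the '-'-splitting of space
# names elsewhere in this code suggests that convention, but it is an
# assumption), and that 'border' separates first-study from second-study
# ratings within 'values':
#
#   toy_data = {'similarities': {'apple-banana': {'values': [3, 4, 5, 2],
#                                                 'border': 2}}}
#   find_limit('all', toy_data, ['apple', 'banana'])      # -> 4: all ratings
#   find_limit('within', toy_data, ['apple', 'banana'])   # -> 2: values[:border]
#   find_limit('between', toy_data, ['apple', 'banana'])  # -> 2: values[border:]
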
def select_data_subset(subset, data_set):
    """
    Select a subset of the given data set.

    The parameter 'subset' can have the following values:
    'all', 'between', 'within', 'cats'.
    Returns a pair of lists: items_of_interest and categories_of_interest.
    """
    category_names = data_set['category_names']

    # sort item IDs based on categories
    item_ids = []
    for category in category_names:
        item_ids += data_set['categories'][category]['items']

    if subset == "all":
        # use all the similarity ratings that we have
        items_of_interest = list(item_ids)
        categories_of_interest = list(category_names)

    elif subset == "between":
        # only use the similarity ratings from the 'between' file
        items_of_interest = []
        for idx1, item1 in enumerate(item_ids):
            for idx2, item2 in enumerate(item_ids):
                if idx2 <= idx1:
                    continue
                tuple_id = list_to_string([item1, item2])
                if tuple_id in data_set['similarities']:
                    border = data_set['similarities'][tuple_id]['border']
                    between_ratings = data_set['similarities'][tuple_id]['values'][border:]
                    if len(between_ratings) > 0:
                        items_of_interest.append(item1)
                        items_of_interest.append(item2)
        items_of_interest = list(set(items_of_interest))  # remove duplicates
        cats = list(set(map(lambda x: data_set['items'][x]['category'], items_of_interest)))
        categories_of_interest = [cat for cat in category_names if cat in cats]

    elif subset == "within":
        # only use the similarity ratings from the 'within' file
        items_of_interest = []
        for idx1, item1 in enumerate(item_ids):
            for idx2, item2 in enumerate(item_ids):
                if idx2 <= idx1:
                    continue
                tuple_id = list_to_string([item1, item2])
                if tuple_id in data_set['similarities']:
                    border = data_set['similarities'][tuple_id]['border']
                    within_ratings = data_set['similarities'][tuple_id]['values'][:border]
                    if len(within_ratings) > 0:
                        items_of_interest.append(item1)
                        items_of_interest.append(item2)
        items_of_interest = list(set(items_of_interest))  # remove duplicates
        cats = list(set(map(lambda x: data_set['items'][x]['category'], items_of_interest)))
        categories_of_interest = [cat for cat in category_names if cat in cats]

    elif subset == "cats":
        # consider only the categories from the second study, but use all items within them
        categories_of_interest = ["buildings", "vegetables", "dishes", "insects",
                                  "street vehicles", "fruits", "electrical appliances",
                                  "animals", "upper body clothing", "plants", "birds",
                                  "tools"]
        items_of_interest = []
        for item in item_ids:
            if data_set['items'][item]['category'] in categories_of_interest:
                items_of_interest.append(item)

    # no matter which subset was used: sort the item IDs by category
    items = list(items_of_interest)
    items_of_interest = []
    for category in categories_of_interest:
        for item in data_set['categories'][category]['items']:
            if item in items:
                items_of_interest.append(item)

    return items_of_interest, categories_of_interest
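
# Usage sketch on a hypothetical data_set (structure as assumed by the code
# above: 'category_names', 'categories' -> per-category item lists,
# 'items' -> per-item category, 'similarities' -> rated pairs with a 'border'):
#
#   items, cats = select_data_subset('between', data_set)
#
# With 'between', an item is kept only if it occurs in at least one rated pair
# that still has ratings after the border index; the returned item IDs are
# ordered by category, following the per-category lists in data_set['categories'].
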
        similarity_info[str(sorted([item_map[tokens[3]], item_map[tokens[5]]]))] = {
            'relation': 'within',
            'category_type': vis_sim_map[tokens[1]],
            'values': similarity_values,
            'border': len(similarity_values)}

# now read within_between category information
with open(args.within_between_file, 'r') as f_in:
    for line in f_in:
        # ignore header
        if line.startswith("Relation"):
            continue

        tokens = list(map(lambda x: x.replace(' ', ''), line.replace('\n', '').split(',')))

        # convert into readable names (tokens[4] and tokens[8] are the raw item IDs)
        item1 = item_map[tokens[4]]
        item2 = item_map[tokens[8]]
        item_tuple_id = list_to_string([item1, item2])

        for item in [item1, item2]:
            # check whether the items are already known (they should be by now!)
            if item not in item_info:
                raise Exception("unknown item!")

        # get a list of all the similarity values (remove empty entries, then convert to int) and store them
        similarity_values = list(map(lambda x: int(x), filter(None, tokens[12:])))
        if args.reverse:
            # mirror the rating scale (x -> 6 - x, i.e., presumably a scale from 1 to 5)
            similarity_values = list(map(lambda x: 6 - x, similarity_values))

        # transform information about category type
        category_type = 'Mix'
        if tokens[0] == 'within':
            if tokens[1] == 'visDis':
    # standard itertools powerset recipe, e.g., powerset([1, 2]) yields
    # (), (1,), (2,), (1, 2)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

with open(args.output_file, 'w', buffering=1) as f_out:

    f_out.write("n_dims,type,dims,scoring,weights,{0}\n".format(','.join(correlation_metrics)))

    if args.feature_folder is not None:
        # look at the power set of all spaces
        spaces = sorted(powerset(sorted(feature_data.keys())))
    else:
        spaces = sorted(map(lambda x: x.split('-'), distances.keys()))

    for space in spaces:
        space_name = list_to_string(space)
        print(space_name)
        number_of_dimensions = len(space)

        if number_of_dimensions == 0:
            # ignore empty set
            continue

        if args.feature_folder is not None:
            # keep the largest set of aggregated scale types found among the features of this space
            largest_set_of_scale_types = []
            for feature_name in space:
                if len(feature_data[feature_name]['aggregated'].keys()) > len(largest_set_of_scale_types):
                    largest_set_of_scale_types = sorted(feature_data[feature_name]['aggregated'].keys())
        else:
            largest_set_of_scale_types = sorted(