コード例 #1
0
def find_limit(subset, data_set, items_of_interest):
    """
    Find the minimum number of similarity ratings to use as a limit.

    Only ratings for pairs drawn from items_of_interest are considered.
    For subset == 'between' only ratings after the stored border are counted,
    for subset == 'within' only ratings before it; any other value counts all
    ratings. Pairs with zero remaining ratings are ignored. Returns 1000 if
    no pair contributes any ratings.
    """
    limit = 1000
    similarities = data_set['similarities']

    # every unordered pair of distinct items, each pair visited once
    for i, first_item in enumerate(items_of_interest):
        for second_item in items_of_interest[i + 1:]:

            pair_key = list_to_string([first_item, second_item])
            if pair_key not in similarities:
                continue

            entry = similarities[pair_key]
            ratings = entry['values']
            if subset == "between":
                # drop everything from the first study
                ratings = ratings[entry['border']:]
            elif subset == "within":
                # drop everything from the second study
                ratings = ratings[:entry['border']]

            # only adapt the limit if there are any ratings left
            if ratings:
                limit = min(limit, len(ratings))

    return limit
コード例 #2
0
def _items_with_ratings(data_set, item_ids, subset):
    """
    Return the (deduplicated, unordered) list of items from item_ids that
    participate in at least one pair with a non-empty set of ratings in the
    given subset: 'between' keeps ratings after the stored border,
    'within' keeps ratings before it.
    """
    result = []
    for idx, item1 in enumerate(item_ids):
        for item2 in item_ids[idx + 1:]:
            tuple_id = list_to_string([item1, item2])
            if tuple_id not in data_set['similarities']:
                continue
            entry = data_set['similarities'][tuple_id]
            border = entry['border']
            if subset == "between":
                ratings = entry['values'][border:]
            else:
                ratings = entry['values'][:border]
            if len(ratings) > 0:
                result.append(item1)
                result.append(item2)
    return list(set(result))  # remove duplicates


def select_data_subset(subset, data_set):
    """
    Select a subset of the given data set.

    The parameter 'subset' can have the following values:
      'all'     - use all items of all categories
      'between' - only items with ratings from the 'between' file
      'within'  - only items with ratings from the 'within' file
      'cats'    - all items of the hard-coded second-study categories

    Returns a pair of lists: (items_of_interest, categories_of_interest),
    with items sorted by category order.
    Raises a ValueError for an unknown subset name.
    """

    category_names = data_set['category_names']

    # sort item IDs based on categories
    item_ids = []
    for category in category_names:
        item_ids += data_set['categories'][category]['items']

    if subset == "all":
        # use all the similarity ratings that we have
        items_of_interest = list(item_ids)
        categories_of_interest = list(category_names)

    elif subset in ("between", "within"):
        # only use the similarity ratings from the respective file
        items_of_interest = _items_with_ratings(data_set, item_ids, subset)
        # keep only the categories that still have at least one item,
        # preserving the original category order
        cats = {data_set['items'][item]['category'] for item in items_of_interest}
        categories_of_interest = [cat for cat in category_names if cat in cats]

    elif subset == "cats":
        # consider only the categories from the second study, but use all items within them
        categories_of_interest = ["buildings", "vegetables", "dishes", "insects", "street vehicles", 
                                      "fruits", "electrical appliances", "animals", 
                                      "upper body clothing", "plants", "birds", "tools"]
        items_of_interest = [item for item in item_ids
                             if data_set['items'][item]['category'] in categories_of_interest]

    else:
        # fail loudly instead of crashing later with an UnboundLocalError
        raise ValueError("unknown subset: {0}".format(subset))

    # no matter which subset was used: sort the item IDs in category order
    items = set(items_of_interest)  # set for O(1) membership tests
    items_of_interest = []
    for category in categories_of_interest:
        for item in data_set['categories'][category]['items']:
            if item in items:
                items_of_interest.append(item)

    return items_of_interest, categories_of_interest
コード例 #3
0
        similarity_info[str(sorted([item_map[tokens[3]], item_map[tokens[5]]]))] = {'relation': 'within', 'category_type': vis_sim_map[tokens[1]], 'values': similarity_values, 'border':len(similarity_values)}

# now read within_between category information
with open(args.within_between_file, 'r') as f_in:
    for line in f_in:
        # ignore header
        if line.startswith("Relation"):
            continue
        
        tokens = list(map(lambda x: x.replace(' ', ''), line.replace('\n', '').split(',')))
        
        # convert into readable name
        item1 = item_map[tokens[4]]
        item2 = item_map[tokens[8]]

        item_tuple_id = list_to_string([item1, item2])
        
        for item in [item1, item2]:
            # check whether the items are already known (they should be by now!)
            if item not in item_info:
                raise Exception("unknown item!")
        
        # get a list of all the similarity values (remove empty entries, then convert to int) and store them
        similarity_values = list(map(lambda x: int(x), filter(None, tokens[12:])))
        if args.reverse:
            similarity_values = list(map(lambda x: 6 - x, similarity_values))
        
        # transform information about category type
        category_type = 'Mix'
        if tokens[0] == 'within':
            if tokens[1] == 'visDis':
コード例 #4
0
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))


with open(args.output_file, 'w', buffering=1) as f_out:

    f_out.write("n_dims,type,dims,scoring,weights,{0}\n".format(
        ','.join(correlation_metrics)))

    if args.feature_folder is not None:
        # look at the power set of all spaces
        spaces = sorted(powerset(sorted(feature_data.keys())))
    else:
        spaces = sorted(map(lambda x: x.split('-'), distances.keys()))
    for space in spaces:

        space_name = list_to_string(space)
        print(space_name)
        number_of_dimensions = len(space)
        if number_of_dimensions == 0:
            # ignore empty set
            continue

        if args.feature_folder is not None:
            largest_set_of_scale_types = []
            for feature_name in space:
                if len(feature_data[feature_name]['aggregated'].keys()) > len(
                        largest_set_of_scale_types):
                    largest_set_of_scale_types = sorted(
                        feature_data[feature_name]['aggregated'].keys())
        else:
            largest_set_of_scale_types = sorted(