def initial_result_task5(vectors, t, query_gesture):
    """Return the t gestures most relevant to *query_gesture* via Personalized PageRank.

    Runs task1/task3 to (re)build feature vectors and the gesture-gesture
    similarity matrix on disk, turns the similarity matrix into a sparse
    graph keeping each node's `graph_degree` strongest edges, then ranks all
    gestures by their PPR score seeded at the query gesture.

    Args:
        vectors: path to the CSV of gesture feature vectors (first column is
            the gesture name).
        t: number of top-ranked gestures to return.
        query_gesture: gesture name to seed the restart vector with.

    Returns:
        List of the t gesture names with the highest PPR scores.
    """
    data_file_name = vectors
    task1.call_task1("outputs/", "tf_idf", "pca", 10)
    task3.call_task3("tf_idf", "outputs/", "pca", 4, "svd", "False")
    similarity_matrix_file_name = "outputs/similarity_matrix_pca.csv"
    data_matrix = np.array(
        pd.read_csv(data_file_name, header=None, low_memory=False))
    # Row of the query gesture in the data matrix (matched by name).
    query_gesture_row_index = np.where(data_matrix == query_gesture)[0][0]
    graph_degree = 10
    similarity_matrix = np.array(
        pd.read_csv(similarity_matrix_file_name, header=None))
    # Column index -> gesture file name (header row of the similarity matrix).
    column_file_map = similarity_matrix[0][1:].tolist()
    adjacency_graph = np.array(similarity_matrix[1:, 1:].tolist(), dtype=float)
    # Keep only each node's graph_degree strongest outgoing edges.
    adjacency_graph = adjacency_graph * (
        adjacency_graph >= np.sort(adjacency_graph, axis=1)[:, [-graph_degree]])
    # Column-normalize so each column sums to 1 (transition matrix).
    normalized_adjacency_graph = sklearn.preprocessing.normalize(
        adjacency_graph, norm='l1', axis=0)
    restart_vector = np.zeros((len(adjacency_graph), 1))
    restart_vector[query_gesture_row_index][0] = 1
    ppr_vector = ppr(normalized_adjacency_graph, restart_vector)
    # Rank all gestures by PPR score, descending, and take the top t.
    sorted_list = sorted(zip(ppr_vector, range(len(ppr_vector))),
                         key=lambda v: v[0],
                         reverse=True)
    dominant_feature_indices = [i for (_, i) in sorted_list[:t]]
    dominant_features = [column_file_map[i] for i in dominant_feature_indices]
    print("Dominant features ", dominant_features)
    return dominant_features
def get_updated_gestures_task5(relevant_gestures, irrelevant_gestures, t, query_gesture):
    """Relevance-feedback update: move the query vector toward relevant gestures
    and away from irrelevant ones, persist it, then re-rank via PPR.

    Args:
        relevant_gestures: gesture names marked relevant by the user (may be empty).
        irrelevant_gestures: gesture names marked irrelevant (may be empty).
        t: number of top-ranked gestures to return.
        query_gesture: name of the query gesture.

    Returns:
        List of the t gesture names with the highest PPR scores after the update.
    """
    print(query_gesture)
    data_file_name = "outputs/tf_idf_pca_vectors.csv"
    similarity_matrix_file_name = "outputs/similarity_matrix_pca.csv"
    data_matrix = np.array(pd.read_csv(data_file_name, header=None))
    query_gesture_row_index = np.where(data_matrix == query_gesture)[0][0]
    graph_degree = 10
    task3.call_task3("tf_idf", "outputs/", "pca", 4, "svd", "False")
    relevant_gesture_row_indices = []
    irrelevant_gestures_vector = np.zeros((1, len(data_matrix[0]) - 1),
                                          dtype=object)
    relevant_gestures_vector = np.zeros((1, len(data_matrix[0]) - 1),
                                        dtype=object)
    # NOTE: np.float was removed in NumPy 1.24; use the builtin float instead.
    query_gesture_values = data_matrix[query_gesture_row_index, 1:].astype(float)
    if relevant_gestures:
        # Mean of the relevant gesture vectors (positive contribution).
        for gesture in relevant_gestures:
            gesture_row_index = np.where(data_matrix == gesture)[0][0]
            relevant_gestures_vector = np.add(
                relevant_gestures_vector,
                data_matrix[gesture_row_index, 1:].astype(float))
            relevant_gesture_row_indices.append(gesture_row_index)
        relevant_gestures_vector = (
            1 / (len(relevant_gestures))) * relevant_gestures_vector
    if irrelevant_gestures:
        # Mean of the irrelevant gesture vectors, negated (pushes the query away).
        for gesture in irrelevant_gestures:
            gesture_row_index = np.where(data_matrix == gesture)[0][0]
            irrelevant_gestures_vector = np.add(
                irrelevant_gestures_vector,
                data_matrix[gesture_row_index, 1:].astype(float))
        irrelevant_gestures_vector = (
            -1 / len(irrelevant_gestures)) * irrelevant_gestures_vector
    # BUG FIX: the original called np.add(a, b, c), which treats the third
    # array as the out= parameter and silently DISCARDED the irrelevant
    # contribution. All three terms must be summed.
    data_matrix[query_gesture_row_index, 1:] = (
        query_gesture_values + relevant_gestures_vector + irrelevant_gestures_vector)
    pd.DataFrame(data_matrix).to_csv(data_file_name, header=None, index=None)
    similarity_matrix = np.array(
        pd.read_csv(similarity_matrix_file_name, header=None))
    # Column index -> gesture file name (header row of the similarity matrix).
    column_file_map = similarity_matrix[0][1:].tolist()
    adjacency_graph = np.array(similarity_matrix[1:, 1:].tolist(), dtype=float)
    # Keep only each node's graph_degree strongest outgoing edges.
    adjacency_graph = adjacency_graph * (
        adjacency_graph >= np.sort(adjacency_graph, axis=1)[:, [-graph_degree]])
    normalized_adjacency_graph = sklearn.preprocessing.normalize(
        adjacency_graph, norm='l1', axis=0)
    # Seed the restart vector with the query AND all relevant gestures.
    restart_vector = np.zeros((len(adjacency_graph), 1))
    restart_vector[query_gesture_row_index][0] = 1
    for i in relevant_gesture_row_indices:
        restart_vector[i][0] = 1
    ppr_vector = ppr(normalized_adjacency_graph, restart_vector)
    sorted_list = sorted(zip(ppr_vector, range(len(ppr_vector))),
                         key=lambda v: v[0],
                         reverse=True)
    dominant_feature_indices = [i for (_, i) in sorted_list[:t]]
    dominant_features = [column_file_map[i] for i in dominant_feature_indices]
    return dominant_features
def ppr_2(labels_train, vector_model, output_dir, user_option, custom_cost, k):
    """Classify unlabelled gestures with one Personalized PageRank per class.

    Builds the gesture-gesture similarity graph (via task3), computes one PPR
    vector seeded uniformly on each class's labelled gestures, and assigns
    every unlabelled gesture the class whose PPR vector scores it highest.
    Results are written to `<output_dir>/ppr_2_classification.csv`.

    Args:
        labels_train: array-like of (gesture_id, label) training rows.
        vector_model: vector model name passed through to task3 (e.g. "tf_idf").
        output_dir: directory containing/receiving the CSV artifacts.
        user_option: dimensionality-reduction option (names the similarity file).
        custom_cost: flag forwarded to task3.
        k: number of strongest edges kept per node in the similarity graph.
    """
    labels_train_dict = {}
    for each in labels_train.tolist():
        labels_train_dict[str(each[0]) + "_words.csv"] = each[1]
    # Construct the gesture-gesture similarity matrix on disk.
    task3.call_task3(vector_model, output_dir, user_option, 4, "svd",
                     custom_cost)
    similarity_matrix_df = pd.read_csv(output_dir + "similarity_matrix_" +
                                       user_option + ".csv",
                                       header=None,
                                       low_memory=False)
    similarity_matrix = np.array(similarity_matrix_df)
    # Column index -> gesture file name / file name -> column index.
    column_file_map = similarity_matrix[0][1:].tolist()
    name_column_map = {
        filename: index for index, filename in enumerate(column_file_map)
    }
    adjacency_graph = np.array(similarity_matrix[1:, 1:].tolist(), dtype=float)
    # Keep only each node's k strongest outgoing edges.
    adjacency_graph = adjacency_graph * (
        adjacency_graph >= np.sort(adjacency_graph, axis=1)[:, [-k]])
    normalized_adjacency_graph = sklearn.preprocessing.normalize(
        adjacency_graph, norm='l1', axis=0)
    vector_size = len(adjacency_graph)
    # Group training gestures by class. Loop variables renamed so the
    # parameter `k` is no longer shadowed (original used `for k, v in ...`).
    class_set = list(set(labels_train_dict.values()))
    label_map = {}
    classlist = [[] for _ in class_set]
    for i, c in enumerate(class_set):
        for fname, label in labels_train_dict.items():
            if c == label:
                label_map[i] = c
                classlist[i].append(fname)
    # One PPR vector per class, seeded uniformly on that class's gestures.
    # (Generalizes the original hard-coded 3-class unrolling.)
    ppr_vectors = []
    for members in classlist:
        restart_vector = np.zeros((vector_size, 1))
        for f in members:
            restart_vector[name_column_map[f]][0] = 1 / len(members)
        ppr_vectors.append(ppr(normalized_adjacency_graph, restart_vector))
    labelled_gestures = list(labels_train_dict.keys())
    unlabelled_gestures = list(set(column_file_map) - set(labelled_gestures))
    unlabelled_gesture_columns = [
        name_column_map[x] + 1 for x in unlabelled_gestures
    ]
    # `with` guarantees the output file is closed (original leaked the handle).
    with open(output_dir + "ppr_2_classification.csv", "w") as output_file:
        csv_write = csv.writer(output_file)
        for c_index, c in enumerate(unlabelled_gesture_columns):
            user_specified_column = name_column_map[column_file_map[c - 1]]
            # Score this gesture under every class's PPR; pick the argmax.
            scores = [v[user_specified_column][0] for v in ppr_vectors]
            label = scores.index(max(scores))
            csv_write.writerow(
                (unlabelled_gestures[c_index].replace("_words.csv", ""),
                 label_map[label]))
# Script-level relevance-feedback re-ranking. Relies on names defined
# elsewhere in the file: `similarity_matrix`, `graph_degree`,
# `query_gesture_row_index`, `relevant_gesture_row_indices`,
# `column_file_map`, `ppr`, and parsed CLI `args` — TODO confirm against
# the surrounding script.
adjacency_graph = np.array(similarity_matrix[1:, 1:].tolist(), dtype=float)
# Keep only each node's graph_degree strongest outgoing edges.
adjacency_graph = adjacency_graph * (adjacency_graph >= np.sort(
    adjacency_graph, axis=1)[:, [-graph_degree]])
normalized_adjacency_graph = sklearn.preprocessing.normalize(
    adjacency_graph, norm='l1', axis=0)
# Restart vector seeded with the query gesture plus all relevant gestures.
restart_vector = np.zeros((len(adjacency_graph), 1))
restart_vector[query_gesture_row_index][0] = 1
for i in relevant_gesture_row_indices:
    # print(i)
    restart_vector[i][0] = 1
# Reset the feedback list for the next round (after it has been consumed).
relevant_gesture_row_indices = []
ppr_vector = ppr(normalized_adjacency_graph, restart_vector)
# Rank gestures by PPR score, descending.
sorted_list = sorted(zip(ppr_vector, range(len(ppr_vector))),
                     key=lambda v: v[0],
                     reverse=True)
dominant_feature_indices = []
for (s, i) in sorted_list:
    dominant_feature_indices.append(i)
# Top args.t gestures, reported with the "_words.csv" suffix stripped.
dominant_feature_indices = dominant_feature_indices[:args.t]
dominant_features = [
    column_file_map[i].replace("_words.csv", "")
    for i in dominant_feature_indices
]
print("Dominant features ", dominant_features)
def ppr_classifier(labels_train, vector_model, output_dir, user_option, custom_cost, k):
    """Classify each unlabelled gesture by PPR over a small labelled subgraph.

    For every unlabelled gesture, builds a subgraph containing the labelled
    gestures plus that one gesture, runs PPR seeded at the unlabelled gesture,
    and assigns the majority label among the top-ranked labelled neighbours.
    Results are written to `<output_dir>/ppr_classification.csv`.

    Args:
        labels_train: array-like of (gesture_id, label) training rows.
        vector_model: vector model name passed through to task3 (e.g. "tf_idf").
        output_dir: directory containing/receiving the CSV artifacts.
        user_option: dimensionality-reduction option (names the similarity file).
        custom_cost: flag forwarded to task3.
        k: number of strongest edges kept per node in each subgraph.
    """
    labels_train_dict = {}
    for each in labels_train.tolist():
        labels_train_dict[str(each[0]) + "_words.csv"] = each[1]
    number_of_dominant_features = 10
    # Construct the gesture-gesture similarity matrix on disk.
    task3.call_task3(vector_model, output_dir, user_option, 4, "svd",
                     custom_cost)
    similarity_path = output_dir + "similarity_matrix_" + user_option + ".csv"
    similarity_matrix = np.array(
        pd.read_csv(similarity_path, header=None, low_memory=False))
    # Column index -> gesture file name / file name -> column index.
    column_file_map = similarity_matrix[0][1:].tolist()
    name_column_map = {
        filename: index for index, filename in enumerate(column_file_map)
    }
    labelled_gestures = list(labels_train_dict.keys())
    labelled_gesture_columns = [
        name_column_map[x] + 1 for x in labelled_gestures
    ]
    unlabelled_gestures = list(set(column_file_map) - set(labelled_gestures))
    unlabelled_gesture_columns = [
        name_column_map[x] + 1 for x in unlabelled_gestures
    ]
    # Loop-invariant hoist: the original re-read this CSV from disk on every
    # iteration; one read suffices since the frame is never mutated.
    similarity_df = pd.read_csv(similarity_path, low_memory=False)
    # `with` guarantees the output file is closed (original leaked the handle).
    with open(output_dir + "ppr_classification.csv", "w") as output_file:
        csv_write = csv.writer(output_file)
        for c_index, c in enumerate(unlabelled_gesture_columns):
            matrix_columns = labelled_gesture_columns + [c]
            gestures = labelled_gestures + [column_file_map[c - 1]]
            # Subgraph restricted to the labelled gestures + this gesture.
            # "Nothing" is the name of the row-label column in the CSV header.
            sub = similarity_df.loc[:, ["Nothing"] + gestures]
            sub = sub.loc[sub["Nothing"].isin(gestures)]
            adjacency_graph = np.array(sub)[:, 1:].astype(dtype=float)
            # Keep only each node's k strongest outgoing edges.
            adjacency_graph = adjacency_graph * (
                adjacency_graph >= np.sort(adjacency_graph, axis=1)[:, [-k]])
            normalized_adjacency_graph = sklearn.preprocessing.normalize(
                adjacency_graph, norm='l1', axis=0)
            vector_size = len(adjacency_graph)
            restart_vector = np.zeros((vector_size, 1))
            # Assumes the unlabelled gesture lands in the last row of the
            # filtered subgraph — TODO confirm row order matches `gestures`.
            restart_vector[vector_size - 1][0] = 1
            ppr_vector = ppr(normalized_adjacency_graph, restart_vector)
            # Renamed comprehension variable so the parameter `k` is not shadowed.
            matrix_file_names = [
                column_file_map[col - 1] for col in matrix_columns
            ]
            dominant_file_names = sorted(
                zip(ppr_vector, matrix_file_names),
                key=lambda v: v[0],
                reverse=True)[:number_of_dominant_features]
            # Majority vote among the top-ranked labelled neighbours
            # (the gesture itself is excluded from the vote).
            dominant_features_class = [
                labels_train_dict[filename]
                for _, filename in dominant_file_names
                if filename != column_file_map[c - 1]
            ]
            class_label = max(set(dominant_features_class),
                              key=dominant_features_class.count)
            csv_write.writerow(
                (unlabelled_gestures[c_index].replace("_words.csv", ""),
                 class_label))