def __init__(self, config_file=CONFIG_FILE):
    """Populate the clustering settings from an INI-style configuration file.

    Args:
        config_file: Path handed to ``RawConfigParser.read``.
    """
    self.X = None

    self.config = parser = ConfigParser.RawConfigParser()
    parser.read(config_file)

    # The namespace file is read first because the SPARQL helper is built
    # from it before any other option is looked up.
    self.dbp_prefix_file = parser.get("DBpedia", "Namespaces")
    self.sp_util = SarqlUtil(self.dbp_prefix_file)
    self.queries = []

    clustering = "QueryClustering"
    self.distance_hungarian_script = parser.get(clustering, "DistanceHungarian")

    # Integer-valued clustering parameters.
    for attr_name, option in (
        ("K", "K"),
        ("random_shuffel_max_iters", "RandomShuffelMaxIters"),
        ("kmediods_max_iters", "KmediodsMaxIters"),
    ):
        setattr(self, attr_name, parser.getint(clustering, option))

    # String-valued file locations, looked up in the same order as before so
    # a missing option surfaces at the same point.
    for attr_name, section, option in (
        ("cluster_cach_file", clustering, "HungarianClusterCach"),
        ("center_cach_file", clustering, "HungarianCenterCach"),
        ("training_query_file", "Query", "TrainingQuery"),
        ("training_algebra_feature_file", "Query", "TrainingAlgebraFeatures"),
        ("training_execution_times_file", "Query", "TrainingQueryExecutionTimes"),
        ("validation_query_file", "Query", "ValidationQuery"),
        ("test_query_file", "Query", "TestQuery"),
        ("distance_matrix_file", clustering, "TrainingDistanceHungarianMatrix"),
    ):
        setattr(self, attr_name, parser.get(section, option))

    # Filled in later by clustering (or lazily by prediction).
    self.center_idxs = None
    self.idx = None
class ClusterSparql:
    """Cluster SPARQL queries with k-medoids over a pairwise distance matrix.

    Typical flow, as far as this class shows: load queries into ``self.X``,
    obtain ``self.distance_matrix`` (computed, loaded from a ``.npy`` cache, or
    read from a text file of ``i j d`` triples), call ``cluster_queries``, and
    later assign new queries to the nearest centre with ``predict_cluster``.
    """

    def __init__(self, config_file=CONFIG_FILE):
        """Read every setting this class needs from ``config_file``.

        Args:
            config_file: Path handed to ``RawConfigParser.read``.
        """
        self.X = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read(config_file)

        # NOTE: DBpedia/TotalQuery and DBpedia/QueryFile are intentionally not
        # read here; load_queries_dbp_log() looks them up on demand.
        self.dbp_prefix_file = self.config.get("DBpedia", "Namespaces")
        self.sp_util = SarqlUtil(self.dbp_prefix_file)
        self.queries = []

        # External program invoked by distance_hungarian().
        self.distance_hungarian_script = self.config.get("QueryClustering", "DistanceHungarian")
        self.K = self.config.getint("QueryClustering", "K")
        self.random_shuffel_max_iters = self.config.getint("QueryClustering", "RandomShuffelMaxIters")
        self.kmediods_max_iters = self.config.getint("QueryClustering", "KmediodsMaxIters")

        # Text files written by save_clusters().
        self.cluster_cach_file = self.config.get("QueryClustering", "HungarianClusterCach")
        self.center_cach_file = self.config.get("QueryClustering", "HungarianCenterCach")

        self.training_query_file = self.config.get("Query", "TrainingQuery")
        self.training_algebra_feature_file = self.config.get("Query", "TrainingAlgebraFeatures")
        self.training_execution_times_file = self.config.get("Query", "TrainingQueryExecutionTimes")
        self.validation_query_file = self.config.get("Query", "ValidationQuery")
        self.test_query_file = self.config.get("Query", "TestQuery")

        # Text file of "i j distance" rows read by load_distaince_hungarian_matrix().
        self.distance_matrix_file = self.config.get("QueryClustering", "TrainingDistanceHungarianMatrix")

        # Set by cluster_queries(); center_idxs may also be loaded lazily by
        # predict_cluster().
        self.center_idxs = None
        self.idx = None

    def load_queries_dbp_log(self):
        """Load up to ``total_query`` queries from a DBpedia log file into ``self.X``.

        ``query_file`` and ``total_query`` are taken from the instance when a
        caller has set them; otherwise they are read from the ``DBpedia``
        config section (options ``QueryFile`` and ``TotalQuery``). Previously
        the method read instance attributes that ``__init__`` never assigned.

        Lines that cannot be converted are skipped on purpose (best effort).
        """
        query_file = getattr(self, "query_file", None)
        if query_file is None:
            query_file = self.config.get("DBpedia", "QueryFile")
        total_query = getattr(self, "total_query", None)
        if total_query is None:
            total_query = self.config.getint("DBpedia", "TotalQuery")

        count = 0
        with open(query_file, "rb") as f:
            for line in f:
                if count >= total_query:
                    break
                try:
                    sparql_query = self.sp_util.dbp_log_to_sparql(line)
                except Exception:
                    # Deliberately tolerant of unparsable log lines, but no
                    # longer swallows KeyboardInterrupt / SystemExit.
                    continue
                count += 1
                self.queries.append(sparql_query)
        self.X = np.array(self.queries).transpose()

    def load_training_queries(self, limit=None):
        """Load the training queries from ``self.training_query_file`` into ``self.X``.

        Each line is prefixed with ``/sparql/?`` before being handed to
        ``self.sp_util.url_to_sparql`` (the existing comment says this is what
        makes the DBPSB queries valid for URL parsing). Lines that raise are
        reported on stdout and skipped.

        Args:
            limit: Accepted for backward compatibility; currently ignored.
        """
        print("loading training queries: %s" % (self.training_query_file,))
        with open(self.training_query_file, "rb") as f:
            for line in f:
                try:
                    sparql_query = self.sp_util.url_to_sparql("/sparql/?" + line)
                    self.queries.append(sparql_query)
                except Exception as inst:
                    print("Exception %s" % (inst,))
        self.X = np.array(self.queries).transpose()

    def distance_hungarian(self, q1, q2):
        """Return the distance between two queries as reported by the external script.

        Both queries are passed through ``self.sp_util.get_dbp_sparql``, written
        to fixed-name files in the current working directory, and the
        configured script is run as ``<script> --file <file1> <file2>``. Its
        output is parsed as a float.

        NOTE: the temp file names are fixed, so concurrent calls from the same
        working directory would overwrite each other's inputs.

        Raises:
            Exception: If the script exits with a non-zero status.
        """
        tmp_q1_file = "tmp_q1_file~"
        tmp_q2_file = "tmp_q2_file~"

        dbp_query1 = self.sp_util.get_dbp_sparql(q1)
        with open(tmp_q1_file, "w") as tq1:
            tq1.write(dbp_query1)

        dbp_query2 = self.sp_util.get_dbp_sparql(q2)
        with open(tmp_q2_file, "w") as tq2:
            tq2.write(dbp_query2)

        cmd = " ".join([self.distance_hungarian_script, "--file", tmp_q1_file, tmp_q2_file])
        (status, abs_query_str) = commands.getstatusoutput(cmd)
        if status != 0:
            raise Exception("GED error status: " + str(status) + " " + abs_query_str)
        return float(abs_query_str)

    def _resolve_distance_function(self, distance_function):
        """Return a two-argument callable for ``distance_function``.

        The default value used in the method signatures below is the raw
        ``distance_hungarian`` function captured while the class body was being
        executed; it still expects ``self`` and so cannot be called with just
        two queries. That one case is swapped for the bound method.
        """
        if distance_function is ClusterSparql.__dict__.get("distance_hungarian"):
            return self.distance_hungarian
        return distance_function

    def compute_distance_matrix_real_time(self, distance_function=distance_hungarian):
        """Compute the pairwise distance matrix for ``self.X`` and cache it with ``np.save``.

        Args:
            distance_function: Callable taking two queries. Defaults to the
                Hungarian distance of this instance.
        """
        pairwise = self._resolve_distance_function(distance_function)
        self.distance_matrix = k_mediods.compute_symmetric_distance(self.X, pairwise)
        # The cache name is derived from the argument exactly as supplied so
        # file names stay the same as before.
        distance_filename = self.distance_cach_filename(distance_function)
        np.save(distance_filename, self.distance_matrix)

    def compute_distance_matrix_from_cach(self, distance_function=distance_hungarian):
        """Load ``self.distance_matrix`` from the ``.npy`` cache for ``distance_function``."""
        distance_filename = self.distance_cach_filename(distance_function)
        self.distance_matrix = np.load(distance_filename + ".npy")

    def distance_cach_filename(self, distance_function):
        """Return the cache file stem (without ``.npy``) for ``distance_function``.

        NOTE(review): the original code first assigned "distance_matrix" and
        then overwrote it, so that prefix never appeared in the name. The
        produced names are kept as they were so existing cache files still
        load; confirm whether the prefix was intended before changing them.
        """
        return self.distance_function_name(distance_function) + "_hungarian" + "_cach"

    def distance_function_name(self, distance_function):
        """Return the file-name tag for ``distance_function`` ("" when unknown).

        MODIFY THIS WHEN A NEW distance_function is added.
        """
        if distance_function == self.distance_hungarian:
            return "_hungarian"
        return ""

    def save_clusters(self, distance_function):
        """Write cluster assignments and centre indices as integer text files.

        Args:
            distance_function: Unused; kept for interface compatibility.
        """
        np.savetxt(self.cluster_cach_file, self.idx, fmt="%d")
        np.savetxt(self.center_cach_file, self.center_idxs, fmt="%d")

    def predict_cluster(self, Xi, distance_function, url_to_sparql=False):
        """Return the index (into ``self.X``) of the cluster centre closest to ``Xi``.

        Args:
            Xi: The query to classify.
            distance_function: Callable taking ``(Xi, centre_query)`` and
                returning a comparable distance.
            url_to_sparql: When true, ``Xi`` is first converted with
                ``self.sp_util.url_to_sparql``.

        Returns:
            The centre index with the smallest distance, or ``-1`` when there
            are no centres.
        """
        if url_to_sparql:
            Xi = self.sp_util.url_to_sparql(Xi)

        # ``is None`` rather than ``== None``: once center_idxs is an ndarray
        # the equality form is ambiguous in a boolean context.
        if self.center_idxs is None:
            # Read back what save_clusters() wrote with np.savetxt. The old
            # code referenced an undefined name and expected a .npy file.
            # atleast_1d keeps a single centre iterable.
            self.center_idxs = np.atleast_1d(np.loadtxt(self.center_cach_file, dtype=int))

        min_dist = np.inf
        min_k = -1
        for k in self.center_idxs:
            d = distance_function(Xi, self.X[k])
            if min_dist > d:
                min_dist = d
                min_k = k
        return min_k

    def cluster_queries(self, distance_function):
        """Run k-medoids on ``self.X`` and persist the result via ``save_clusters``.

        Requires ``self.X`` and ``self.distance_matrix`` to be populated.
        Prints the model cost on stdout.

        Args:
            distance_function: Forwarded to ``save_clusters``.
        """
        (_, min_center_idxs) = k_mediods.initial_random_centers(self.X, self.K)
        (self.center_idxs, self.idx) = k_mediods.k_mediods(
            self.X, min_center_idxs, self.kmediods_max_iters, self.distance_matrix
        )
        total_cost = k_mediods.model_cost(self.X, self.idx, self.center_idxs, self.distance_matrix)
        print("model cost:  %s" % (total_cost,))
        self.save_clusters(distance_function)

    def load_distaince_hungarian_matrix(self):
        """Fill ``self.distance_matrix`` from ``self.distance_matrix_file``.

        Must be called after loading training queries, because the matrix is
        sized from ``self.X``. Each non-blank line is read as
        ``i j distance`` and stored symmetrically.
        """
        m = np.size(self.X, 0)
        print("m: %s" % (m,))
        self.distance_matrix = np.zeros((m, m), dtype=float)
        with open(self.distance_matrix_file) as f:
            for line in f:
                row = line.split()
                if not row:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                i = int(row[0])
                j = int(row[1])
                d = float(row[2])
                self.distance_matrix[i, j] = d
                self.distance_matrix[j, i] = d