def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number, exclude_ref, weights_factor):
    '''
    Performs a weighted average of the rows of self.subs_matrix, where the
    weight of every context is its score in self.sim_scores raised to the
    power weights_factor.

    :param ref_subvec: subvec of the reference context as a sparse column
        matrix, or None. NOTE: when it takes part in the average its
        ``data`` array is rescaled in place by 1/sum_weights.
    :param top: if > 0, keep only the highest scoring contexts; the
        reference vector occupies one of the ``top`` slots unless excluded
    :param top_percent: if > 0, fraction of (contexts + reference) to keep;
        the larger of the ``top`` and ``top_percent`` limits is used
    :param top_inferences_number: passed on to __vec_to_sorted_list
    :param exclude_ref: when equal to False the reference vector gets weight 1
    :param weights_factor: exponent applied to the similarity scores
    :returns: parvec, number of contexts averaged ((None, 0) if there are
        no contexts)
    '''
    if len(self.contexts) == 0:
        return None, 0

    # Kept as an equality test on purpose: only a value equal to False
    # (False or 0) includes the reference; None does not.
    include_ref = (exclude_ref == False)
    ref_weight = 1 if include_ref else 0

    if top > len(self.contexts) + ref_weight:
        top = len(self.contexts) + ref_weight

    if top > 0 or top_percent > 0:
        scores = self.sim_scores.todok()
        final_top = top - ref_weight  # -1 to leave 1 for the ref_subvec
        num_top_percent = int(math.ceil(top_percent * (len(self.contexts) + ref_weight))) - ref_weight
        final_top = max(final_top, num_top_percent)
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        cw_sorted = heapq.nlargest(final_top, scores.items(), key=lambda x: x[1])
        top_contexts_weights = dok_matrix((len(self.contexts), 1), dtype=np.float32)
        for (k, j), weight in cw_sorted:
            top_contexts_weights[k, j] = weight ** weights_factor
        top_contexts_weights = top_contexts_weights.tocsr()
        contexts_num = len(cw_sorted)
    else:
        contexts_num = len(self.contexts)
        if weights_factor == 0.0:
            # exponent 0 means every context weighs the same
            top_contexts_weights = dok_matrix([[1.0] * contexts_num]).tocsr().transpose()
        else:
            top_contexts_weights = self.sim_scores.copy()
            top_contexts_weights.data **= weights_factor

    sum_weights = top_contexts_weights.sum() + ref_weight  # weight +1 reserved for ref_subvec
    top_contexts_weights.data /= sum_weights

    weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights)  # NOT SUPPORTED IN SCIPY 0.7
    avg_subvec = weighted_subs_matrix.sum(axis=0)

    # identity test: comparing a sparse matrix to None with != is not a
    # plain boolean test
    if include_ref and ref_subvec is not None:
        ref_subvec.data *= 1.0 / sum_weights
        avg_subvec = avg_subvec + ref_weight * ref_subvec.transpose()

    result_vec = self.__vec_to_sorted_list(avg_subvec, top_inferences_number)
    return result_vec, contexts_num
def _compute_relations(dictionary):
    """Build every relation matrix for the given dictionary.

    The ``RelationsGraph._compute_contains``, ``_compute_father``,
    ``_compute_siblings`` and ``_compute_table_rank`` helpers are called in
    that order and their results gathered under the relation names.
    Aggregate relations (siblings, inclusion, father, child, etymology) are
    not computed here.

    :param dictionary: object handed to the ``RelationsGraph._compute_*``
        helpers; ``len(dictionary)`` is the side of the identity matrix.
    :returns: dict mapping every name of ``RELATIONS`` to a ``csr_matrix``.
    :raises ValueError: if a name listed in ``RELATIONS`` was not computed.
    """
    relations = {}

    contains = RelationsGraph._compute_contains(dictionary)
    relations['contains'] = csr_matrix(contains)
    relations['contained'] = csr_matrix(relations['contains'].transpose())

    layers = ['_substance', '_attribute', '_mode']
    father = RelationsGraph._compute_father(dictionary)
    for i, layer in enumerate(layers):
        relations['father' + layer] = dok_matrix(father[i])

    siblings = RelationsGraph._compute_siblings(dictionary)
    for i, name in enumerate(['opposed', 'associated', 'crossed', 'twin']):
        relations[name] = dok_matrix(siblings[i])

    # child_* is the transpose of the matching father_* relation
    for layer in layers:
        relations['child' + layer] = relations['father' + layer].transpose()

    table = RelationsGraph._compute_table_rank(dictionary, relations['contained'])
    for i in range(6):
        relations['table_%d' % i] = table[i]

    # Build the identity directly in sparse form: np.eye() would allocate a
    # dense size x size array only to throw almost all of it away.
    size = len(dictionary)
    diagonal = np.arange(size)
    relations['identity'] = csr_matrix((np.ones(size), (diagonal, diagonal)),
                                       shape=(size, size))

    missing = {s for s in RELATIONS if s not in relations}
    if missing:
        raise ValueError("Missing relations : {%s}" % ", ".join(missing))

    return {reltype: csr_matrix(relations[reltype]) for reltype in RELATIONS}
def _compute_relations(self):
    """Compute all relation matrices and store them in ``self.relations``.

    ``self.relations`` is filled step by step (contains/contained, father_*,
    siblings, child_*, table_*, identity) and finally replaced by a dict
    holding one ``csr_matrix`` per name in ``RELATIONS``. Aggregate
    relations (siblings, inclusion, father, child, etymology) are not
    computed here.

    :raises ValueError: if a name listed in ``RELATIONS`` was not computed.
    """
    logger.log(logging.INFO, "Computing relations")

    self.relations = {}

    contains = self._compute_contains()
    self.relations['contains'] = csr_matrix(contains)
    self.relations['contained'] = csr_matrix(
        self.relations['contains'].transpose())

    layers = ['_substance', '_attribute', '_mode']
    father = self._compute_father()
    for i, layer in enumerate(layers):
        self.relations['father' + layer] = dok_matrix(father[i])

    siblings = self._compute_siblings()
    for i, name in enumerate(['opposed', 'associated', 'crossed', 'twin']):
        self.relations[name] = dok_matrix(siblings[i])

    # child_* is the transpose of the matching father_* relation
    for layer in layers:
        self.relations['child' + layer] = self.relations['father' + layer].transpose()

    table = self._compute_table_rank(self.relations['contained'])
    for i in range(6):
        self.relations['table_%d' % i] = table[i]

    # Sparse identity built from its diagonal; np.eye() would materialise a
    # dense size x size array first.
    size = len(self.dictionary)
    diagonal = np.arange(size)
    self.relations['identity'] = csr_matrix(
        (np.ones(size), (diagonal, diagonal)), shape=(size, size))

    missing = {s for s in RELATIONS if s not in self.relations}
    if missing:
        raise ValueError("Missing relations : {%s}" % ", ".join(missing))

    self.relations = {
        reltype: csr_matrix(self.relations[reltype])
        for reltype in RELATIONS
    }
def read_mm_matrix(file):
    """Read a MatrixMarket format file to a matrix object

    The first line (the banner) is always skipped. The next non-comment
    line holds the dimensions; every following line is a 1-based
    ``row column value`` triple of integers. Blank lines and ``%`` comment
    lines are ignored.

    :param file: The file path
    :returns: the matrix as a ``csr_matrix`` with dtype int16
    :raises ValueError: if the file has no dimensions line or a data line
        does not hold exactly three integers
    """
    m = None
    with open(file) as f:
        # Skip the first line (banner)
        next(f, None)
        for line in f:
            tokens = line.split()
            if not tokens or tokens[0].startswith('%'):
                continue
            if m is None:
                # The header: number of rows and columns
                dim_x, dim_y = int(tokens[0]), int(tokens[1])
                m = dok_matrix((dim_x, dim_y), dtype=np.int16)
                continue
            # The rest of the lines are the data
            x, y, v = [int(t) for t in tokens]
            m[x - 1, y - 1] = v
    if m is None:
        raise ValueError("No MatrixMarket size line found in %s" % file)
    return m.tocsr()
def bitmap_to_graph(img, mask):
    """Build a weighted pixel adjacency matrix for the masked part of an image.

    Every interior pixel (the one-pixel border is skipped) whose mask entry
    is truthy is linked to each of the 9 positions of its 3x3 neighbourhood
    (itself included) whose mask entry is truthy as well. The edge weight is
    ``(2 * max(img) - pix1 - pix2) ** 16 + 1``, so it is 1 between two
    pixels at the maximum value and grows as the pixels get darker.

    :param img: 2-d array of scalar pixel values
    :param mask: array indexed like ``img``; truthy entries are graph nodes
    :returns: ``dok_matrix`` of shape (n_pixels, n_pixels), indexed through
        ``to_index``
    """
    rows, cols = img.shape[0], img.shape[1]

    # Value of the brightest pixel. Everything is converted to Python floats
    # before the arithmetic so that small unsigned integer dtypes cannot
    # wrap around in ``am * 2 - pix1 - pix2``.
    am_ind = unravel_index(img.argmax(), img.shape)
    am = float(img[am_ind])

    # A sparse adjacency matrix.
    adjacency = dok_matrix((rows * cols, rows * cols))

    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
    for i in range(1, rows - 1):
        for j in range(1, cols - 1):
            if not mask[i, j]:
                continue
            pix1 = float(img[i, j])
            for y_diff, x_diff in directions:
                ni, nj = i + y_diff, j + x_diff
                if not mask[ni, nj]:
                    continue
                pix2 = float(img[ni, nj])
                adjacency[to_index(img, i, j),
                          to_index(img, ni, nj)] = (am * 2 - pix1 - pix2) ** 16 + 1
    return adjacency
def __init__(self, resource_mat_file, entity_map_file, property_map_file,
             relations_file, whitelist):
    """
    Load the resource with the restricted set of edge types
    according to the whitelist
    :param resource_mat_file: not used by this constructor
    :param entity_map_file: file given to load_map for the entities
    :param property_map_file: file given to load_map for the properties
    :param relations_file: file given to load_edges
    :param whitelist: The list of allowed edge types
    """
    # Properties: load all of them, keep the whitelisted ones only
    all_prop_to_id, _ = load_map(property_map_file, None)
    allowed = set(clean(prop) for prop in whitelist)
    kept = [prop for prop in all_prop_to_id if prop in allowed]
    self.id_to_prop = dict((all_prop_to_id[prop], prop) for prop in kept)
    self.prop_to_id = dict((prop, all_prop_to_id[prop]) for prop in kept)

    # Reversed edges are allowed as soon as one whitelisted edge type
    # mentions a kept property in the '<-prop-' form
    edge_types = []
    for edge_type in whitelist:
        edge_types.append(edge_type.replace('$', '').replace('^', ''))
    self.allow_reversed_edges = any(
        '<-' + prop + '-' in edge_types for prop in self.prop_to_id)

    # Edges of the kept properties
    edges = load_edges(relations_file, None, self.prop_to_id.values())
    self.l2r_edges, self.r2l_edges = edges

    # Entities
    self.term_to_id, self.id_to_term = load_map(entity_map_file, None)

    # Restricted adjacency matrix: one entry per left-to-right edge
    n_terms = len(self.term_to_id)
    m = dok_matrix((n_terms, n_terms), dtype=np.int16)
    for x, targets in self.l2r_edges.items():
        for y in targets:
            m[x, y] = 1
    adjacency = m.tocsr()

    if self.allow_reversed_edges:
        adjacency = adjacency + adjacency.T
    self.adjacency_matrix = adjacency
def read_mm_matrix(file):
    """Read a MatrixMarket format file to a matrix object

    Layout expected by this reader: a banner line (always skipped), a line
    with the dimensions, then one 1-based ``row column value`` integer
    triple per line. ``%`` comment lines and blank lines are skipped.

    :param file: The file path
    :returns: ``csr_matrix`` of dtype int16
    :raises ValueError: when no dimensions line is present, or a data line
        does not contain exactly three integers
    """
    with open(file) as f:
        lines = iter(f)
        # Skip the first line
        next(lines, None)

        m = None
        for line in lines:
            fields = line.strip().split()
            if len(fields) == 0 or fields[0][0] == '%':
                continue
            if m is not None:
                # Data line
                x, y, v = [int(t) for t in fields]
                m[x - 1, y - 1] = v
            else:
                # The header holds the matrix dimensions
                dim_x, dim_y = int(fields[0]), int(fields[1])
                m = dok_matrix((dim_x, dim_y), dtype=np.int16)

    if m is None:
        raise ValueError("No MatrixMarket size line found in %s" % file)
    return m.tocsr()
def _compute_relations(self):
    """Populate ``self.relations`` with one sparse matrix per relation.

    The intermediate results are written into ``self.relations`` as they
    are produced (contains/contained, father_*, opposed/associated/
    crossed/twin, child_*, table_0..5, identity). At the end the attribute
    is rebound to a dict with a ``csr_matrix`` for every name of
    ``RELATIONS``. Aggregate relations (siblings, inclusion, father, child,
    etymology) are not computed here.

    :raises ValueError: if a name listed in ``RELATIONS`` was not computed.
    """
    logger.log(logging.INFO, "Computing relations")

    self.relations = {}

    contains = self._compute_contains()
    self.relations['contains'] = csr_matrix(contains)
    self.relations['contained'] = csr_matrix(self.relations['contains'].transpose())

    father = self._compute_father()
    father_keys = ['father' + r for r in ('_substance', '_attribute', '_mode')]
    for i, key in enumerate(father_keys):
        self.relations[key] = dok_matrix(father[i])

    siblings = self._compute_siblings()
    self.relations['opposed'] = dok_matrix(siblings[0])
    self.relations['associated'] = dok_matrix(siblings[1])
    self.relations['crossed'] = dok_matrix(siblings[2])
    self.relations['twin'] = dok_matrix(siblings[3])

    # every child_* relation is the transpose of its father_* counterpart
    for key in father_keys:
        self.relations['child' + key[len('father'):]] = self.relations[key].transpose()

    table = self._compute_table_rank(self.relations['contained'])
    for i in range(6):
        self.relations['table_%d' % i] = table[i]

    # The identity is described by its diagonal only, so no dense
    # size x size array (as np.eye() would create) is ever allocated.
    size = len(self.dictionary)
    diagonal = np.arange(size)
    self.relations['identity'] = csr_matrix((np.ones(size), (diagonal, diagonal)),
                                            shape=(size, size))

    missing = {s for s in RELATIONS if s not in self.relations}
    if missing:
        raise ValueError("Missing relations : {%s}" % ", ".join(missing))

    self.relations = {reltype: csr_matrix(self.relations[reltype]) for reltype in RELATIONS}
def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts, stopwords, embeddings):
    '''
    Allocates the (initially empty) matrices of a context collection.

    :param args: object providing bow_size; a negative bow_size disables
        the bag-of-words matrices
    :param i2w: index to word lookup
    :param w2i: word to index lookup; len(w2i) is the vocabulary size
    :param subvecs_num: number of rows reserved in every matrix
    :param w2counts: stored as is
    :param sum_word_counts: stored as is
    :param stopwords: stored as is
    :param embeddings: when this is not None the bow representation is
        dense and has embeddings.dimension() columns
    '''
    self.args = args
    self.w2i = w2i
    self.i2w = i2w
    self.w2counts = w2counts
    self.sum_word_counts = sum_word_counts
    self.stopwords = stopwords
    self.contexts = []
    self.sim_scores = None  # points either to self.subvecs_sim_scores or to self.bow_sim_scores

    # every context starts with the same similarity score; the scores sum to 1
    initial_sim_score = 1.0 if subvecs_num == 0 else 1.0 / subvecs_num

    self.embeddings = embeddings  # when this is not None the bow representation is dense (todo: refactor this code)
    self.bow_size = args.bow_size
    if self.bow_size >= 0:
        # identity test instead of "!= None", which would go through any
        # custom comparison defined by the embeddings object
        if self.embeddings is not None:
            bow_dimensionality = self.embeddings.dimension()
            self.bow_matrix = np.zeros((subvecs_num, bow_dimensionality), dtype=np.float32)  # estimate sim of contexts based on their BOW rep
            self.bow_L2_norms = None  # we always keep them normalized
        else:
            bow_dimensionality = len(w2i)
            self.bow_matrix = dok_matrix((subvecs_num, bow_dimensionality), dtype=np.float32)  # estimate sim of contexts based on their BOW rep
            self.bow_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
        self.bow_sim_scores = dok_matrix([[initial_sim_score] * subvecs_num]).tocsr().transpose()

    self.subs_matrix = dok_matrix((subvecs_num, len(w2i)), dtype=np.float32)  # used for sim weights calculation, also for sub average only if no dual matrix
    self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
    self.subvecs_sim_scores = dok_matrix([[initial_sim_score] * subvecs_num]).tocsr().transpose()
    self.target_counts = {}
def gen_random(nodes, k):
    """Generate a random directed graph as a sparse adjacency matrix.

    For every node, ``k`` targets are drawn uniformly (with replacement)
    from all nodes; the draw is repeated until it contains at least 3
    distinct targets and does not contain the node itself. Each node
    therefore gets between 3 and ``k`` outgoing edges and no self loop.

    :param nodes: number of nodes
    :param k: number of targets drawn per node
    :returns: ``csr_matrix`` of shape (nodes, nodes) with 1 for every edge
    :raises ValueError: if the constraints can never be met (``k < 3`` or
        fewer than 4 nodes), which previously made the loop spin forever
    """
    # Local import so the block does not rely on the deprecated
    # numpy.random.random_integers; randint(0, nodes, k) draws from the
    # same closed range [0, nodes - 1].
    from numpy.random import randint

    if nodes > 0 and (k < 3 or nodes < 4):
        raise ValueError(
            "need k >= 3 and at least 4 nodes to pick 3 distinct neighbours "
            "(got nodes=%r, k=%r)" % (nodes, k))

    building_matrix = dok_matrix((nodes, nodes))
    for node in range(nodes):
        k_ns = set([node])
        # redraw until there is no self loop and at least 3 distinct targets
        while node in k_ns or len(k_ns) < 3:
            k_ns = set(randint(0, nodes, k))
        for index in k_ns:
            building_matrix[node, index] = 1
    return building_matrix.tocsr()
def create_one_hot_vector(x, dim):
    """Creates the one-hot vector representing this node

    :param x -- the node (column index that is set to 1)
    :param dim -- the number of nodes (the adjacency matrix dimension)
    :returns a 1 x dim int16 matrix in CSR format
    """
    # filled in DOK form, handed out as CSR
    one_hot = dok_matrix((1, dim), dtype=np.int16)
    one_hot[0, x] = 1
    return one_hot.tocsr()
def __init__(self, total_feature_count, version_count, feature_list, target_id, start, end, ngram_sizes=None, ngram_levels=None, label="", sparse=False, dok=False):
    """
    Initialize an empty dataset.

    A dataset consists of two components:
    - The data attribute is a matrix containing all input data. Its size is
      version_count x (total_feature_count + ngram_count), where ngram_count
      is len(ngram_sizes) * len(ngram_levels) (0 if either is missing).
      Each row of the data matrix represents the feature vector of one version.
    - The target attribute is a dense vector containing the ground truth.
      Its size is version_count.

    Args:
        total_feature_count (int): Amount of (non-ngram) features. Together with the
            ngram vectors it gives the columns of the data matrix.
        version_count (int): Amount of versions. Equals the rows of the data matrix
            and the length of the target vector.
        feature_list (List[str]): A list of Feature IDs. Must be in the same order as they are in the dataset.
        target_id (str): ID of the target which is used in this dataset. E.g. 'month'
        start (datetime): Start of the date range contained in this dataset.
        end (datetime): End of the date range contained in this dataset.
        ngram_sizes (list[int]): Optional. The ngram-sizes in this dataset (e.g. [1, 2] for 1-grams and 2-grams)
        ngram_levels (list[int]): Optional. The ngram-levels in this dataset.
        label (str): An arbitrary label, e.g. "Test", for this dataset. Useful when caching!
        sparse (bool): If the data matrix should be sparse. Recommended in combination with ngrams.
        dok (bool): If a dok-type sparse matrix should be used (only with sparse=True).
            Dok is faster to update. Can be converted to CSR.
    """
    ngram_count = 0
    if ngram_sizes and ngram_levels:
        ngram_count = len(ngram_sizes) * len(ngram_levels)

    # arguments are passed separately so the message is only formatted
    # when debug logging is enabled
    logging.debug(
        "Initializing Dataset with %i versions, %i features and %i ngram vectors.",
        version_count, total_feature_count, ngram_count)

    dimension = (version_count, total_feature_count + ngram_count)
    if not sparse:
        self.data = np.zeros(dimension)
    elif dok:
        self.data = dok_matrix(dimension, dtype=np.float64)
    else:
        self.data = csr_matrix(dimension, dtype=np.float64)
    self.target = np.zeros(version_count)

    self.feature_list = feature_list
    self.target_id = target_id
    self.start = start
    self.end = end
    self.ngram_sizes = ngram_sizes
    self.ngram_levels = ngram_levels
    self.label = label
    self.sparse = sparse
def reference_context(self, subvec, context, bow_interpolate):
    '''
    Weighs contexts in this collection according to similarity to the
    given reference context. The subvec is first turned into a sparse
    column vector indexed through self.w2i; the actual work is delegated
    to __reference_context_imp.
    :param subvec: subvec representation of given context, as (word, weight) pairs
    :param context: given context
    :param bow_interpolate: interpolation factor (between bow and subvec similarity)
    :returns: the result of __reference_context_imp (subvec as a numpy matrix)
    '''
    word_index = self.w2i
    column = dok_matrix((len(word_index), 1), dtype=np.float32)
    for entry in subvec:
        word, weight = entry
        column[word_index[word], 0] = weight
    return self.__reference_context_imp(column.tocsr(), context, bow_interpolate)
def __init__(self, pset, h=None, alpha=None):
    # Defaults are the values for liquid water, incompressible flow,
    # Alejandro Jacobo Cabrera Crespo (2008): h = 0.012 m, alpha = 0.5
    self.__h = 0.012 if h is None else h
    self.__alpha = 0.5 if alpha is None else alpha

    # Distances between particles i and j, one entry per pair
    n_particles = pset.size
    self.__r = dok.dok_matrix((n_particles, n_particles), dtype=np.float64)
def reference_context(self, subvec, context, bow_interpolate):
    '''
    Weighs contexts in this collection according to similarity to the given reference context
    :param subvec: subvec representation of given context ((word, weight) pairs)
    :param context: given context
    :param bow_interpolate: interpolation factor (between bow and subvec similarity)
    :returns: what __reference_context_imp returns for the sparse form of
        subvec (subvec as a numpy matrix)
    '''
    vocabulary_size = len(self.w2i)
    sparse_subvec = dok_matrix((vocabulary_size, 1), dtype=np.float32)

    # one row per vocabulary word; words missing from subvec stay at 0
    for word, weight in subvec:
        row = self.w2i[word]
        sparse_subvec[row, 0] = weight

    sparse_subvec = sparse_subvec.tocsr()
    result = self.__reference_context_imp(sparse_subvec, context, bow_interpolate)
    return result
def map_kernel(self, fn_kernel, pset):
    '''
    Apply a kernel function to every stored distance.

    r is the distance between particles 'i' and 'j'.

    :param fn_kernel: callable invoked as fn_kernel(r=r) for every stored
        (i, j) -> r entry
    :param pset: object whose size attribute gives the side of the result
    :returns: dok_matrix (float64) of shape (pset.size, pset.size) holding
        fn_kernel(r) at every (i, j) that has a stored distance
    '''
    kernel = dok.dok_matrix((pset.size, pset.size), dtype=np.float64)
    # The placeholder initialisations of the connection and distance were
    # dropped: both were overwritten before being read.
    for (i, j), r in self.__r.items():
        kernel[i, j] = fn_kernel(r=r)
    return kernel
def main(original_img, pathimage, x1, x2, y1, y2):
    """Find a path of painted pixels between two points of an image.

    A pixel counts as painted when the sum of its first three channels is
    non-zero. Painted interior pixels (the one-pixel border is skipped) are
    linked to the painted pixels of their 3x3 neighbourhood, and an
    unweighted shortest path from (y1, x1) to (y2, x2) is searched.

    :param original_img: image array with at least three channels
    :param pathimage: image on which the path is drawn (channel 0 of every
        path pixel is set to 255); it is modified in place and displayed
    :param x1, y1: coordinates of the source pixel
    :param x2, y2: coordinates of the target pixel
    :returns: list of [i, j] pixel coordinates from the target back towards
        the source (the source itself is not included), or 1 when the
        target cannot be reached
    """
    # Sum through ndarray.sum(): it accumulates small integer dtypes in a
    # wider type, whereas adding the channel arrays directly can wrap around
    # to 0 and make a painted pixel look unpainted.
    img = original_img[:, :, :3].sum(axis=2)

    n_pixels = img.shape[0] * img.shape[1]
    adjacency = dok_matrix((n_pixels, n_pixels), dtype=bool)

    # Link every painted interior pixel to its painted neighbours
    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
    for i in range(1, img.shape[0] - 1):
        for j in range(1, img.shape[1] - 1):
            if not img[i, j]:
                continue
            for y_diff, x_diff in directions:
                if img[i + y_diff, j + x_diff]:
                    adjacency[to_index(img, i, j),
                              to_index(img, i + y_diff, j + x_diff)] = True

    source = to_index(img, y1, x1)
    target = to_index(img, y2, x2)

    # Compute the shortest path between the source and all other points in the image
    _, predecessors = dijkstra(adjacency, directed=False, indices=[source],
                               unweighted=True, return_predecessors=True)

    # Walk the predecessors back from the target to the source
    pixel_index = target
    pixels_path = []
    while pixel_index != source:
        pixels_path.append(pixel_index)
        pixel_index = predecessors[0, pixel_index]
        # -9999 is what the predecessor matrix holds where there is no path
        if pixel_index == -9999:
            return 1

    # The following code is just for debugging and it visualizes the chosen path
    path = []
    for pixel_index in pixels_path:
        i, j = to_coordinates(img, pixel_index)
        print(i, j)
        path.append([i, j])
        pathimage[i, j, 0] = 255
    plt.imshow(pathimage)
    plt.show()
    return path
def build_extra_features(noncat_matrix):
    """Derive (numerator, denominator) features from fractional entries.

    For every non-zero entry that is not a whole number, the multipliers
    0..19 are tried; whenever ``abs(val) * i`` lies less than 0.001 below
    its ceiling, that ceiling and ``i`` are written to columns ``2 * y``
    and ``2 * y + 1`` of the result. Later matches overwrite earlier ones,
    so the largest matching multiplier is the one that remains.

    :param noncat_matrix: 2-d matrix supporting ``shape``, ``nonzero()``
        and ``[row, col]`` indexing
    :returns: ``dok_matrix`` with the same number of rows and twice the
        number of columns
    """
    n_rows, n_cols = noncat_matrix.shape[0], noncat_matrix.shape[1]
    X = dok_matrix((n_rows, n_cols * 2))

    rows, cols = noncat_matrix.nonzero()
    print(len(rows), "nonzero elems")

    for seen, (row, col) in enumerate(zip(rows, cols), 1):
        # progress report every 1000 entries
        if seen % 1000 == 0:
            print(seen)

        val = noncat_matrix[row, col]
        if val - math.floor(val) == 0.0:
            continue  # whole numbers produce no extra feature

        magnitude = abs(val)
        for multiplier in range(20):
            scaled = magnitude * multiplier
            rounded_up = math.ceil(scaled)
            if abs(scaled - rounded_up) < 0.001:
                X[row, 2 * col] = rounded_up
                X[row, 2 * col + 1] = multiplier
    return X
def __init__(self, map):
    """
    init shortest paths

    input:
        map - a 2d numpy array representing all reachable areas of the map.
              (probably just the reachability map)
    """
    self.map = map

    # boolean adjacency matrix with one row and one column per map cell
    n_cells = map.shape[0] * map.shape[1]
    self.adjacency = dok_matrix((n_cells, n_cells), dtype=bool)

    # the matrix starts empty; _fill_adjacency populates it
    self._fill_adjacency()
def make_adjacency_matrix(img):
    """Build the boolean pixel adjacency matrix of a 2-d image.

    Every truthy interior pixel (the one-pixel border is skipped) is linked
    to each truthy pixel of its 3x3 neighbourhood, itself included. Pixels
    are numbered with ``to_index(cols, row, col)``.

    :param img: 2-d array; truthy entries are the painted pixels
    :returns: ``dok_matrix`` of dtype bool and shape (rows*cols, rows*cols)
    """
    n_rows, n_cols = img.shape
    n_pixels = n_rows * n_cols
    graph = dok_matrix((n_pixels, n_pixels), dtype=bool)

    offsets = list(itertools.product([0, 1, -1], [0, 1, -1]))
    interior = itertools.product(range(1, n_rows - 1), range(1, n_cols - 1))
    for r, c in interior:
        if img[r, c]:
            for dr, dc in offsets:
                nr, nc = r + dr, c + dc
                if img[nr, nc]:
                    graph[to_index(n_cols, r, c), to_index(n_cols, nr, nc)] = True
    return graph
def __init__(self, resource_mat_file, entity_map_file, property_map_file,
             relations_file, whitelist):
    """
    Load the resource with the restricted set of edge types
    according to the whitelist
    :param resource_mat_file: accepted but not read here
    :param entity_map_file: entity map, read with load_map
    :param property_map_file: property map, read with load_map
    :param relations_file: edges file, read with load_edges
    :param whitelist: The list of allowed edge types
    """
    # Load the properties and filter them according to the whitelist
    loaded_prop_to_id, _ = load_map(property_map_file, None)
    properties_in_whitelist = set(map(clean, whitelist))

    id_to_prop = {}
    prop_to_id = {}
    for prop in loaded_prop_to_id:
        if prop not in properties_in_whitelist:
            continue
        prop_id = loaded_prop_to_id[prop]
        id_to_prop[prop_id] = prop
        prop_to_id[prop] = prop_id
    self.prop_to_id, self.id_to_prop = prop_to_id, id_to_prop

    # '$' and '^' markers are stripped before looking for '<-prop-' patterns
    edge_types = [edge_type.replace('$', '').replace('^', '')
                  for edge_type in whitelist]
    reversed_props = [prop for prop in self.prop_to_id
                      if '<-' + prop + '-' in edge_types]
    self.allow_reversed_edges = bool(reversed_props)

    # Load the edges for the specific properties
    self.l2r_edges, self.r2l_edges = load_edges(relations_file, None,
                                                prop_to_id.values())

    # Load the entities
    self.term_to_id, self.id_to_term = load_map(entity_map_file, None)

    # Load the restricted matrix
    side = len(self.term_to_id)
    m = dok_matrix((side, side), dtype=np.int16)
    for x in self.l2r_edges:
        for y in self.l2r_edges[x]:
            m[x, y] = 1

    self.adjacency_matrix = m.tocsr()
    if self.allow_reversed_edges:
        # adding the transpose makes every edge usable in both directions
        self.adjacency_matrix = self.adjacency_matrix + self.adjacency_matrix.T
def __init__(self, size, dim, m=np.array([]), Consts=1.0, f_inter=None):
    # size   : number of rows of the per-element arrays allocated below
    # dim    : number of columns of the acceleration and force arrays
    # m      : masses; set_masses(m) is called only for a non-empty value
    # Consts : stored as the constant K
    # f_inter: forwarded to the parent constructor
    super(LinearSpringConstrained, self).__init__(size, dim, m, Consts, f_inter=f_inter)

    self.__dim = dim
    self.__size = size

    self.__K = Consts

    self.__A = np.zeros((size, dim))
    self.__F = np.zeros((size, dim))

    self.__Fm = dok.dok_matrix((size, size))
    self.__Fm2 = csr.csr_matrix((size, size))

    self.__M = np.zeros((size, 1))

    # m=None is treated like "no masses" instead of failing in len()
    if m is not None and len(m) != 0:
        self.set_masses(m)
def scipy_sparse_matrix_from_dict(neighbors):
    """
    Convert a neighbors dict into a sparse adjacency matrix.

    Rows and columns follow the sorted order of the area names.

    Parameters
    ----------
    neighbors : dict
        Each key represents an area. The corresponding value contains the
        area's neighbors.

    Returns
    -------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix representing the areas' contiguity relation.

    Examples
    --------
    >>> neighbors = {0: {1, 3}, 1: {0, 2, 4}, 2: {1, 5},
    ...              3: {0, 4}, 4: {1, 3, 5}, 5: {2, 4}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0, 1, 0, 0],
    ...                     [1, 0, 1, 0, 1, 0],
    ...                     [0, 1, 0, 0, 0, 1],
    ...                     [1, 0, 0, 0, 1, 0],
    ...                     [0, 1, 0, 1, 0, 1],
    ...                     [0, 0, 1, 0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    >>> neighbors = {"left": {"middle"},
    ...              "middle": {"left", "right"},
    ...              "right": {"middle"}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0],
    ...                     [1, 0, 1],
    ...                     [0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    """
    # position of every area in sorted order
    position = {}
    for pos, area_name in enumerate(sorted(neighbors)):
        position[area_name] = pos

    size = len(neighbors)
    adj = dok_matrix((size, size))
    for area, adjacent_areas in neighbors.items():
        row = position[area]
        for other in adjacent_areas:
            adj[row, position[other]] = 1
    return adj.tocsr()
def scipy_sparse_matrix_from_dict(neighbors):
    """
    Build the adjacency matrix described by a neighbors dict.

    Area names are numbered in sorted order; entry (i, j) is 1 when area j
    is listed among the neighbors of area i.

    Parameters
    ----------
    neighbors : dict
        Each key represents an area. The corresponding value contains the
        area's neighbors.

    Returns
    -------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix representing the areas' contiguity relation.

    Examples
    --------
    >>> neighbors = {0: {1, 3}, 1: {0, 2, 4}, 2: {1, 5},
    ...              3: {0, 4}, 4: {1, 3, 5}, 5: {2, 4}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0, 1, 0, 0],
    ...                     [1, 0, 1, 0, 1, 0],
    ...                     [0, 1, 0, 0, 0, 1],
    ...                     [1, 0, 0, 0, 1, 0],
    ...                     [0, 1, 0, 1, 0, 1],
    ...                     [0, 0, 1, 0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    >>> neighbors = {"left": {"middle"},
    ...              "middle": {"left", "right"},
    ...              "right": {"middle"}}
    >>> obtained = scipy_sparse_matrix_from_dict(neighbors)
    >>> desired = np.array([[0, 1, 0],
    ...                     [1, 0, 1],
    ...                     [0, 1, 0]])
    >>> (obtained.todense() == desired).all()
    True
    """
    ordered_names = sorted(neighbors)
    name_to_int = dict(zip(ordered_names, range(len(ordered_names))))

    n_areas = len(neighbors)
    adj = dok_matrix((n_areas, n_areas))

    # one matrix entry per (area, neighbor) pair
    pairs = ((area, nb) for area in neighbors for nb in neighbors[area])
    for area, nb in pairs:
        adj[name_to_int[area], name_to_int[nb]] = 1
    return adj.tocsr()
def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts,
             stopwords, embeddings):
    '''
    Sets up an empty context collection with room for subvecs_num contexts.

    :param args: must provide bow_size; the bag-of-words matrices are only
        created when bow_size >= 0
    :param i2w: index to word lookup
    :param w2i: word to index lookup (its length is the vocabulary size)
    :param subvecs_num: number of rows of every allocated matrix
    :param w2counts: kept on the instance unchanged
    :param sum_word_counts: kept on the instance unchanged
    :param stopwords: kept on the instance unchanged
    :param embeddings: None for a sparse bow matrix over the vocabulary,
        otherwise an object whose dimension() gives the width of a dense
        bow matrix
    '''
    self.args = args
    self.w2i = w2i
    self.i2w = i2w
    self.w2counts = w2counts
    self.sum_word_counts = sum_word_counts
    self.stopwords = stopwords
    self.contexts = []
    self.sim_scores = None  # points either to self.subvecs_sim_scores or to self.bow_sim_scores
    self.embeddings = embeddings  # when this is not None the bow representation is dense (todo: refactor this code)
    self.bow_size = args.bow_size

    # uniform initial similarity, summing to 1 over all contexts
    initial_sim_score = 1.0 if subvecs_num == 0 else 1.0 / subvecs_num

    def uniform_scores():
        # fresh subvecs_num x 1 column filled with the initial score
        return dok_matrix([[initial_sim_score] * subvecs_num]).tocsr().transpose()

    if self.bow_size >= 0:
        # "is not None" rather than "!= None": the check must not depend on
        # how the embeddings object implements comparison
        if self.embeddings is not None:
            bow_dimensionality = self.embeddings.dimension()
            # estimate sim of contexts based on their BOW rep
            self.bow_matrix = np.zeros((subvecs_num, bow_dimensionality),
                                       dtype=np.float32)
            self.bow_L2_norms = None  # we always keep them normalized
            self.bow_sim_scores = uniform_scores()
        else:
            bow_dimensionality = len(w2i)
            # estimate sim of contexts based on their BOW rep
            self.bow_matrix = dok_matrix((subvecs_num, bow_dimensionality),
                                         dtype=np.float32)
            self.bow_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
            self.bow_sim_scores = uniform_scores()

    # used for sim weights calculation, also for sub average only if no dual matrix
    self.subs_matrix = dok_matrix((subvecs_num, len(w2i)), dtype=np.float32)
    self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
    self.subvecs_sim_scores = uniform_scores()
    self.target_counts = {}
def read_matrix(path):
    """Read a MatrixMarket format file to a matrix object

    The first line is skipped unconditionally (banner). The first following
    line that is neither blank nor a ``%`` comment gives the dimensions;
    all later such lines are 1-based ``row column value`` integer triples.

    :param path: path of the file to read
    :returns: the matrix as an int16 ``csr_matrix``
    :raises ValueError: if no dimensions line is found, or a data line does
        not consist of exactly three integers
    """
    m = None
    with open(path) as f:
        next(f, None)  # banner line
        for line in f:
            tokens = line.split()
            if not tokens or tokens[0].startswith('%'):
                continue
            if m is None:
                dim_x, dim_y = int(tokens[0]), int(tokens[1])
                m = dok_matrix((dim_x, dim_y), dtype=np.int16)
            else:
                x, y, v = [int(t) for t in tokens]
                m[x - 1, y - 1] = v
    if m is None:
        raise ValueError("No MatrixMarket size line found in %s" % path)
    return m.tocsr()
def read_matrix(path):
    """Read a MatrixMarket format file to a matrix object

    Expected layout: one banner line (always skipped), one line starting
    with the number of rows and columns, then ``row column value`` integer
    triples with 1-based indices. Blank lines and lines starting with
    ``%`` are ignored wherever they appear after the banner.

    :param path: path of the file to read
    :returns: int16 ``csr_matrix`` holding the listed values
    :raises ValueError: when the dimensions line is missing, or a data line
        is not made of exactly three integers
    """
    with open(path) as f:
        rows = iter(f)
        next(rows, None)  # the banner carries no data

        # keep only the lines that hold numbers
        records = (line.split() for line in rows)
        records = (rec for rec in records if rec and not rec[0].startswith('%'))

        header = next(records, None)
        if header is None:
            raise ValueError("No MatrixMarket size line found in %s" % path)
        dim_x, dim_y = int(header[0]), int(header[1])
        m = dok_matrix((dim_x, dim_y), dtype=np.int16)

        for rec in records:
            x, y, v = [int(t) for t in rec]
            m[x - 1, y - 1] = v
    return m.tocsr()
def __init__(self, size, dim, m=np.array([]), Consts=1.0, f_inter=None):
    # The parent constructor receives every argument unchanged.
    super(LinearSpringConstrained, self).__init__(
        size, dim, m, Consts, f_inter=f_inter)

    per_element = (size, dim)  # shape of the A and F arrays
    pairwise = (size, size)    # shape of the Fm and Fm2 matrices

    self.__dim = dim
    self.__size = size
    self.__K = Consts
    self.__A = np.zeros(per_element)
    self.__F = np.zeros(per_element)
    self.__Fm = dok.dok_matrix(pairwise)
    self.__Fm2 = csr.csr_matrix(pairwise)
    self.__M = np.zeros((size, 1))

    # masses are only set when a non-empty m was given
    if not (m is None or len(m) == 0):
        self.set_masses(m)
def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, subvec_filename, cluster_filename):
    '''
    kmeans clustering of subvecs given in an input file

    The subvecs are read into a sparse matrix (one row per subvec), kmeans
    is restarted ninit times and refined one iteration at a time with the
    centroids re-normalized after every step; the centroids of the run
    with the lowest inertia are pruned, re-normalized and written out.
    Nothing is done if the output file already exists.

    NOTE(review): this block uses Python 2 syntax (print statement, xrange).
    NOTE(review): i2w is read below but is neither a parameter nor a local;
    presumably a module-level index-to-word lookup -- confirm.
    NOTE(review): both files are closed only on the last two lines, so they
    stay open if an exception is raised earlier.

    :param w2i: word2index
    :param cluster_prunning: max size of a cluster centroid
    :param K: number of clusters
    :param ninit: number of repeating tries
    :param maxiter: number of clustering iterations
    :param min_avg_cluster_size: min size of clusters (on average)
    :param subvec_filename: input filename
    :param cluster_filename: output filename
    :returns: None
    '''
    if os.path.exists(cluster_filename):
        print "NOTICE: cluster file %s already exists. skipping." % cluster_filename
        return

    subvec_file = open(subvec_filename, 'r')
    subvec_num = sum(1 for line in subvec_file)/2 #subvec is on every second line
    subvec_file.seek(0)

    # Cap K so that clusters hold min_avg_cluster_size subvecs on average,
    # but always ask for at least one cluster.
    minK = min(subvec_num/min_avg_cluster_size, K)
    minK = max(1, minK)

    cluster_file = open(cluster_filename, 'w')

    print "Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename, minK)

    # Everything after the last '/' of the input path is used as the target name.
    target = subvec_filename[subvec_filename.rfind('/')+1:]

    subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32)

    # Fill one matrix row per subvec; the loop is left through the EOFError
    # handled below.
    line = 0
    try:
        while True:
            context_inst, subvec = read_context(subvec_file)
            normalize_subvec(subvec)
            for word, weight in subvec:
                if (weight != 0):
                    subs_matrix[line, w2i[word]] = weight
            line += 1
            if line % 10000 == 0:
                sys.stderr.write("Read %d subvecs\n" % (line))
    except EOFError:
        sys.stderr.write("Finished loading %d context lines\n" % line)

    subs_matrix = subs_matrix.tocsr()

    best_centroids = None
    best_inertia = None

    for init_iter in xrange(0, ninit):
        # First step starts from a k-means++ initialisation ...
        kmeans = KMeans(init='k-means++', n_clusters=minK, n_init=1, max_iter=1)
        kmeans.fit(subs_matrix)
        centroids = kmeans.cluster_centers_
        normalize_centroids(centroids)
        # ... every further step restarts from the re-normalized centroids
        # of the previous one.
        for iter in xrange(1,maxiter):
            kmeans = KMeans(init=centroids, n_clusters=minK, n_init=1, max_iter=1)
            kmeans.fit(subs_matrix)
            centroids = kmeans.cluster_centers_
            normalize_centroids(centroids)

        # Keep the centroids of the run with the lowest inertia.
        inertia = kmeans.inertia_
        if best_centroids is None or inertia < best_inertia:
            best_inertia = inertia
            best_centroids = centroids

    # Write every centroid: a header line, then its pruned and re-normalized
    # "word weight" pairs separated by tabs.
    for j in xrange(0,len(best_centroids)):
        cluster_vec = [(i2w[i], weight) for (i, weight) in enumerate(best_centroids[j,:]) if weight != 0]
        # keep only the cluster_prunning largest weights
        cluster_vec = sorted(cluster_vec, key=itemgetter(1), reverse=True)[:cluster_prunning]
        norm = sum([weight**2 for word, weight in cluster_vec])**0.5
        cluster_vec = [(word, weight/norm) for word, weight in cluster_vec]
        # recomputed after normalization; only reported in the header line
        norm = sum([weight**2 for word, weight in cluster_vec])**0.5
        cluster_file.write(target + "\t" + str(j) + "\t0\t" + target + "\tCLUSTER\t norm verified = " + '{0:1.8f}'.format(norm) + "\tpruning factor = " + str(cluster_prunning) +"\n")
        for (word, weight) in cluster_vec:
            cluster_file.write(' '.join([word, '{0:1.8f}'.format(weight)])+'\t')
        cluster_file.write('\n')

    subvec_file.close()
    cluster_file.close()
def findShortest(image, original_image):
    # Builds a weighted directed graph over the pixels of `image`, asks
    # networkx for the cheapest path between two fixed pixels and marks
    # pixels on `original_image`, which is modified in place and returned.
    #
    # image          : 2-d array used to decide the edge weights
    # original_image : 3-d array (height, width, channels) that is drawn on

    # Load the image from disk as a numpy ndarray
    #original_img = cv2.imread('d_test/24.tif')
    original_img = original_image

    # Create a flat color image for graph building:
    #img = cv2.imread('d_test/24.tif', 0)
    img = image

    # Defines a translation from 2 coordinates to a single number, y = height x = width
    def to_index(y, x):
        return y * img.shape[1] + x

    # Defines a reversed translation from index to 2 coordinates
    # NOTE(review): "/" is a true division on Python 3; callers below wrap
    # the result in int().
    def to_coordinates(index):
        return index / img.shape[1], index % img.shape[1]

    # A sparse adjacency matrix.
    # Two pixels are adjacent in the graph if both are painted.
    # NOTE(review): this matrix is allocated but never filled or read in
    # the active code below; the graph actually used is G2.
    adjacency = dok_matrix(
        (img.shape[0] * img.shape[1], img.shape[0] * img.shape[1]),
        dtype=np.uint8)
    #adjacency = image.img_to_graph(img)
    #adjacency = np.zeros((img.shape[0]*img.shape[1], img.shape[1]*img.shape[0]))

    # The 9 offsets of a 3x3 neighbourhood, (0, 0) included
    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))

    height, width, channels = original_img.shape
    #G2 = nx.complete_graph(height * width)

    #We create a graph the size of our image
    G2 = nx.DiGraph()
    for i in range(width * height):
        G2.add_node(i)

    # These loops create the edges between the nodes. The weight depends on
    # the pixel the edge points to:
    #   truthy pixel, different node -> 1   (lowest cost)
    #   truthy pixel, same node      -> 255 (highest cost, to prevent loops)
    #   falsy pixel                  -> 10  (significantly higher cost)
    for i in range(0, height):
        for j in range(0, width):
            for y_diff, x_diff in directions:
                # skip neighbours that fall outside of the image
                if i + y_diff < 0 or i + y_diff > height - 1 or j + x_diff < 0 or j + x_diff > width - 1:
                    continue
                if img[i + y_diff, j + x_diff]:
                    if to_index(i, j) == to_index(i + y_diff, j + x_diff):
                        #adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)] = 255
                        G2.add_edge(to_index(i, j), to_index(i + y_diff, j + x_diff), weight=255)
                    else:
                        #print("( {0} , {1} )".format(to_index(i, j), to_index(i + y_diff, j + x_diff)))
                        #adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)] = True
                        G2.add_edge(to_index(i, j), to_index(i + y_diff, j + x_diff), weight=1)
                else:
                    #print("White to Black")
                    #print(to_index(i, j), to_index(i + y_diff, j + x_diff))
                    #adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)] = 50
                    G2.add_edge(to_index(i, j), to_index(i + y_diff, j + x_diff), weight=10)
                    #print(adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)])
    """if i == j: #adjacency[to_index(i, j), to_index(i, j)] = 255 G2.add_edge(to_index(i, j), to_index(i + y_diff, j + x_diff), weight=255)"""

    # Source and target are fixed: the horizontal middle of row 1 and of
    # the last row.
    source = to_index(1, int((width / 2)))
    target = to_index(height - 1, int((width / 2)))
    print(to_index(height - 1, int((width / 2) * 0.25)))
    #m = adjacency.todense()
    #G = nx.from_numpy_matrix(m, create_using=nx.DiGr aph(),parallel_edges=True)
    #G2 = nx.DiGraph(adjacency)
    #G2 = nx.from_scipy_sparse_matrix(adjacency, create_using=nx.MultiDiGraph)
    """for n, nbrsdict in G.adjacency_iter(): for nbr,eattr in nbrsdict.items(): if 'weight' in eattr: (n, nbr, eattr['weight'])"""
    #print(G2)
    #G2[0][0]['weight']

    # Cheapest path from source to target, as a list of node indices
    path = nx.shortest_path(G2, source, target, weight='weight')
    print(path)

    # Compute the shortest path between the source and all other points in the image
    """M, predecessors = dijkstra(m, directed=False, unweighted=False, return_predecessors=True) # indices = source, """

    # Constructs the path between source and target
    # NOTE(review): `path` is a list of node indices, yet it is indexed
    # here with a pixel index as if it were a predecessor table; when the
    # index is out of range the IndexError handler ends the loop.
    pixel_index = int(target)
    pixels_path = []
    #print(predecessors)
    #print(predecessors[pixel_index-1])
    while pixel_index != source:
        try:
            pixels_path.append(pixel_index)
            pixel_index = path[pixel_index]
        except IndexError:
            print(pixel_index)
            break

    # The nodes of the computed path are appended after whatever the loop
    # above collected, followed by the target; the first entry is then
    # overwritten with 0.
    for i in path:
        pixels_path.append(i)
    pixels_path.append(target)
    pixels_path[0] = 0

    # The following code is just for debugging and it visualizes the chosen path
    # (channels 0 and 1 of every listed pixel are set to 5; drawing stops
    # at the first pixel that falls outside of the image)
    for pixel_index in pixels_path:
        try:
            i, j = to_coordinates(pixel_index)
            i = int(i)
            j = int(j)
            original_img[i, j, 0] = original_img[i, j, 1] = 5
        except IndexError:
            break
    #cv2.imwrite("d_test/Final Test/img_with_path_weighted_directed_11.tif", original_img)
    #plt.imshow(original_img)
    #plt.show()
    return original_img
def __init__(self, pset=None):
    """Initialise the instance with a 1x1 sparse byte matrix.

    :param pset: optional value stored as ``self.pset``. When it is
        ``None`` (the default) ``self.pset`` is not assigned here.
    """
    self.__S = dok.dok_matrix((1, 1), dtype=np.byte)
    # Identity test on purpose: ``pset != None`` would call pset's own
    # ``__ne__``, which can be overloaded (numpy arrays compare
    # element-wise and then fail in a boolean context).
    if pset is not None:
        self.pset = pset
def allpairsmaxminpath(imgPassed):
    """Return the length of the longest shortest path between hull points.

    The image is edge-detected with Canny, a convex hull is computed for
    every contour, and the unique hull vertices are stamped onto a dilated
    copy of the image. The non-zero pixels of that copy form a graph (each
    interior non-zero pixel is linked to its non-zero 8-neighbours). For
    every pair of hull vertices the unweighted shortest path is traced back
    through the Dijkstra predecessor table and the longest one is kept.

    Relies on the ``to_index`` helper defined elsewhere in this file to map
    (row, col) to a graph node id.
    NOTE(review): presumably ``to_index`` uses the same image width as
    ``imgPassed`` - confirm against its definition.

    :param imgPassed: image array; it is copied and cast to ``uint8``, the
        argument itself is not modified
    :returns: number of steps on the longest shortest path found (0 when
        there are no hull points or no pair of them is connected)
    """
    img2 = np.uint8(np.copy(imgPassed))
    threshold = .5

    # Detect edges using Canny
    canny_output = cv2.Canny(img2, threshold, threshold * 2)
    # Kept from the original: draws the edge map on the current pyplot axes.
    plt.imshow(canny_output)

    # OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 and 4
    # return (contours, hierarchy); the contours are always second to last.
    contours = cv2.findContours(canny_output, cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Collect every convex-hull vertex exactly once.
    hull_points = []
    for contour in contours:
        for point in cv2.convexHull(contour):
            if not any(np.array_equal(point, seen) for seen in hull_points):
                hull_points.append(point)
    hull_points = np.array(hull_points)

    # Dilate the image and stamp the hull vertices onto it so that they are
    # guaranteed to be non-zero graph nodes.
    kernel = np.ones((5, 5), np.uint8)
    dilation = cv2.dilate(np.uint8(np.copy(imgPassed)), kernel, iterations=1)
    for j in range(hull_points.shape[0]):
        dilation[hull_points[j][0][1]][hull_points[j][0][0]] = 3
    img = np.copy(dilation)

    # A sparse adjacency matrix.
    # Two pixels are adjacent in the graph if both are painted.
    adjacency = dok_matrix((img.shape[0] * img.shape[1],
                            img.shape[0] * img.shape[1]), dtype=bool)
    directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
    for i in range(1, img.shape[0] - 1):
        for j in range(1, img.shape[1] - 1):
            if not img[i, j]:
                continue
            for y_diff, x_diff in directions:
                if img[i + y_diff, j + x_diff]:
                    adjacency[to_index(i, j),
                              to_index(i + y_diff, j + x_diff)] = True

    max_path = []
    num_points = hull_points.shape[0]
    for j in range(num_points):
        source = to_index(hull_points[j][0][1], hull_points[j][0][0])
        # The predecessor table depends on the source only, so it is
        # computed once per source instead of once per (source, target).
        _, predecessors = dijkstra(adjacency, directed=False,
                                   indices=[source], unweighted=True,
                                   return_predecessors=True)
        for k in range(j, num_points):
            target = to_index(hull_points[k][0][1], hull_points[k][0][0])

            # Walk back from the target to the source.
            pixels_path = []
            pixel_index = target
            while pixel_index != source:
                if pixel_index < 0:
                    # scipy marks "no predecessor" with -9999: the target
                    # cannot be reached from this source, so skip the pair.
                    pixels_path = []
                    break
                pixels_path.append(pixel_index)
                pixel_index = predecessors[0, pixel_index]

            # If this is now our longest shortest path, keep it.
            if len(pixels_path) > len(max_path):
                max_path = pixels_path

    return len(max_path)
def __context_text_to_vec(self, context_instance):
    """Turn the words around the target token into a single vector.

    With ``self.embeddings`` set, the result is the weighted average of
    the word embeddings, scaled to unit L2 norm. Without embeddings it is
    a sparse bag-of-words column with one row per entry of ``self.w2i``.
    Words are weighted by 1, or by a thresholded log-idf when
    ``self.args.tfidf`` is true. Stopwords, words missing from
    ``self.w2i`` and the target token itself are ignored.

    :param context_instance: object providing ``get_context_tokens()`` and
        ``target_ind`` (position of the target token)
    :returns: ``(text_matrix, found_word)``; ``found_word`` is True when at
        least one context word received a non-zero weight
    """
    # Identity test instead of ``!= None`` so that an embeddings object
    # with overloaded comparison operators cannot change the branch taken.
    use_embeddings = self.embeddings is not None
    found_word = False
    w2ind = self.w2i

    if use_embeddings:
        dimensionality = self.embeddings.dimension()
        text_matrix = np.zeros((dimensionality,), dtype=np.float32)
    else:
        dimensionality = len(w2ind)
        weight_dtype = np.float32 if self.args.tfidf else np.int8
        text_matrix = dok_matrix((dimensionality, 1), dtype=weight_dtype)

    context_text_tokens = context_instance.get_context_tokens()
    target_pos = context_instance.target_ind

    # Optionally restrict the context to a window around the target.
    if self.bow_size > 0:
        start_pos = max(target_pos - self.bow_size, 0)
        end_pos = min(target_pos + self.bow_size + 1, len(context_text_tokens))
        context_text_tokens = context_text_tokens[start_pos:end_pos]
        target_pos = target_pos - start_pos

    stopwords = self.stopwords
    # Slicing past the end yields an empty list, so no explicit bounds
    # check is needed for the right-hand context.
    context_words = (context_text_tokens[:target_pos]
                     + context_text_tokens[target_pos + 1:])
    all_words_inds = [w2ind[word] for word in context_words
                      if word not in stopwords and word in w2ind]

    total_weights = 0.0
    for word_ind in all_words_inds:
        w = self.i2w[word_ind]
        if self.args.tfidf:
            wcount = self.w2counts[w]
            log_idf = math.log(float(self.sum_word_counts) / wcount)
            log_idf -= self.args.tfidf_offset
            if log_idf <= self.args.tfidf_threshold:
                log_idf = 0.0
            weight = log_idf
        else:
            weight = 1

        if weight != 0:
            found_word = True
            if use_embeddings:
                if w in self.embeddings:
                    wordvec = self.embeddings.represent(w).transpose()
                    text_matrix = text_matrix + (wordvec * weight)
                else:
                    # No embedding for this word: it must not count
                    # towards the normalising total either.
                    weight = 0.0
            else:
                text_matrix[word_ind, 0] += weight
        total_weights += weight

    # embeddings representations are always normalized
    if use_embeddings:
        if total_weights != 0:
            text_matrix /= total_weights
        norm = np.sqrt(np.sum(text_matrix * text_matrix))
        if norm != 0:
            text_matrix /= norm

    return text_matrix, found_word
def estimate_ppc_roi(im, tissue_contours, glomeruli_centers, show=False):
    """Draw the ROI on a low magnification image of a WSI.

    For each tissue contour the tissue mask is skeletonized, every
    glomerulus centre inside the tissue is snapped to its closest skeleton
    pixel, and the skeleton is turned into a graph. The two snapped points
    that are farthest apart along the skeleton (shortest-path distance via
    Dijkstra) define the path that is drawn with a wide line, clipped to
    the tissue mask and returned as a contour.

    Note that for this method it is more beneficial to use lower
    magnification, such as 0.25, instead of the standard low magnification
    of range 1.25.

    Source: https://stackoverflow.com/questions/43698577/calculating-the-shortest-path-between-two-points-in-a-bitmap-in-python

    Parameters
    ----------
    im : np.ndarray
        RGB image of tissue at low resolution
    tissue_contours : list
        opencv style contours of the tissue in the image
    glomeruli_centers : list
        list of glomeruli (x, y) centers
    show : bool (optional)
        set to True to plot some of the results

    Return
    ------
    roi_contours : list
        opencv style contours of the ROI in the image; tissues with fewer
        than two glomeruli, without a skeleton, or whose glomeruli are not
        connected along the skeleton contribute no contour

    """
    roi_contours = []

    # run through each individual tissue
    for tissue_contour in tissue_contours:
        # draw the tissue contour
        tissue_mask = cv.drawContours(np.zeros(im.shape[:-1]), [tissue_contour], -1, 1, cv.FILLED)

        # blur the image but force values between 0 and 1
        tissue_mask = cv.GaussianBlur(tissue_mask, (5, 5), 0)
        tissue_mask = (tissue_mask > 0.).astype(np.uint8)

        # find the glomeruli centers that fall within this tissue contour
        tissue_glom_centers = [center for center in glomeruli_centers
                               if tissue_mask[center[1], center[0]]]

        # get the skeleton of the mask and the coordinates of its pixels
        skeleton = skeletonize(tissue_mask).astype(np.uint8)
        rows, cols = np.where(skeleton)
        if len(rows) == 0:
            # nothing to snap the glomeruli to (min() of an empty list
            # would raise below)
            continue
        skeleton_xy = list(zip(cols, rows))

        # find the closest skeleton point for each glomeruli in tissue
        closest_points = []
        for center in tissue_glom_centers:
            distances = [get_euclidean(center, point) for point in skeleton_xy]
            nearest = distances.index(min(distances))
            # add the closest point as (x, y)
            closest_points.append([cols[nearest], rows[nearest]])

        # need at least 2 glomeruli to draw the roi in a tissue
        if len(closest_points) < 2:
            continue

        # Converting skeleton mask to graph problem to apply Dijkstra method
        # to find shortest path
        height, width = skeleton.shape[0], skeleton.shape[1]

        def to_index(y, x):
            # translation from 2 coordinates to a single number
            return y * width + x

        def to_coordinates(index):
            # reversed translation from index to (row, col); integer
            # arithmetic so the result is usable as an index directly
            return divmod(index, width)

        # build sparse adjacency matrix - two pixels are adjacent in the
        # graph if both are painted
        adjacency = dok_matrix((height * width, height * width), dtype=bool)
        directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
        for i in range(1, height - 1):
            for j in range(1, width - 1):
                if not skeleton[i, j]:
                    continue
                for y_diff, x_diff in directions:
                    if skeleton[i + y_diff, j + x_diff]:
                        adjacency[to_index(i, j), to_index(i + y_diff, j + x_diff)] = True

        # convert all the closest points (x, y) to single value, these are
        # known as sources
        sources = [to_index(point[1], point[0]) for point in closest_points]

        # calculate the distance matrix from each source to all possible
        # values in image
        dist_matrix, predecessors = dijkstra(adjacency, directed=False, indices=sources,
                                             unweighted=True, return_predecessors=True)

        # find the pair of sources that are farthest away from each other
        pairs = list(combinations(range(len(closest_points)), 2))
        pair_distances = [dist_matrix[a, sources[b]] for a, b in pairs]

        # Unreachable pairs have an infinite distance; picking one of them
        # would make the back-tracking below run off the predecessor table.
        reachable = [d for d in pair_distances if np.isfinite(d)]
        if not reachable:
            continue
        max_combination = pairs[pair_distances.index(max(reachable))]

        # constructs the path between source and target (the pair of
        # sources that are farthest away from each other)
        source = sources[max_combination[0]]
        target = sources[max_combination[1]]
        pixel_index = target
        pixels_path = []
        while pixel_index != source:
            pixels_path.append(pixel_index)
            pixel_index = predecessors[max_combination[0], pixel_index]

        if not pixels_path:
            # both glomeruli snapped to the same skeleton pixel: there is
            # no line to draw and therefore no contour
            continue

        # create a blank mask to draw only the part of the skeleton
        # connecting the source and target
        roi_mask = Image.new('L', (im.shape[1], im.shape[0]))
        skeleton_points = []
        for pixel_index in pixels_path:
            i, j = to_coordinates(pixel_index)
            skeleton_points.append((int(j), int(i)))

        # use pillow to draw the line with width
        draw = ImageDraw.ImageDraw(roi_mask)
        draw.line(skeleton_points, fill=255, width=40, joint='curve')
        roi_mask = np.array(roi_mask)

        # the width might be too large so do a bit and operation with the
        # tissue mask to remove edges
        roi_mask = cv.bitwise_and(roi_mask, tissue_mask)

        # extract the roi contours; [-2] works for both the 2-tuple
        # (OpenCV 2/4) and the 3-tuple (OpenCV 3) return value
        roi_contour = cv.findContours(roi_mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_TC89_KCOS)[-2]

        # if the points are too close together at low resolution, then
        # there will be no contours to draw, skip these
        if len(roi_contour) > 0:
            # append the first contours
            roi_contours.append(roi_contour[0])

    if show:
        tissue_mask = cv.drawContours(np.zeros(im.shape[:-1]), tissue_contours, -1, 1, cv.FILLED)
        roi_mask = cv.drawContours(np.zeros(im.shape[:-1]), roi_contours, -1, 1, cv.FILLED)
        im_with_roi = cv.drawContours(im.copy(), roi_contours, -1, [255, 0, 0], 2)

        # plot the original image, tissue mask, and roi_mask, draw the roi
        # contours on original image
        fig, ax = plt.subplots(ncols=3, figsize=(10, 5))
        ax[0].imshow(im_with_roi)
        ax[0].set_title('Image with ROI contours', fontsize=14)
        ax[1].imshow(tissue_mask)
        ax[1].set_title('Tissue Mask', fontsize=14)
        ax[2].imshow(roi_mask)
        ax[2].set_title('ROI Mask', fontsize=14)
        plt.show()

    return roi_contours
def __context_text_to_vec(self, context_instance):
    """Represent the context of the target word as one vector.

    When ``self.embeddings`` is available the context words' embeddings
    are averaged (weighted) and the result is scaled to unit L2 norm;
    otherwise a sparse column vector indexed by ``self.w2i`` accumulates
    the word weights. A word's weight is 1, or its thresholded log-idf
    when ``self.args.tfidf`` is true. The target token, stopwords and
    words unknown to ``self.w2i`` do not contribute.

    :param context_instance: object providing ``get_context_tokens()`` and
        ``target_ind`` (position of the target token)
    :returns: ``(text_matrix, found_word)``; ``found_word`` is True when at
        least one context word received a non-zero weight
    """
    found_word = False
    # ``is not None`` rather than ``!= None``: the check must not depend
    # on how the embeddings object implements comparison.
    has_embeddings = self.embeddings is not None
    if has_embeddings:
        dimensionality = self.embeddings.dimension()
        weight_dtype = np.float32
        w2ind = self.w2i
        text_matrix = np.zeros((dimensionality, ), dtype=weight_dtype)
    else:
        dimensionality = len(self.w2i)
        weight_dtype = np.float32 if self.args.tfidf else np.int8
        w2ind = self.w2i
        text_matrix = dok_matrix((dimensionality, 1), dtype=weight_dtype)

    context_text_tokens = context_instance.get_context_tokens()
    target_pos = context_instance.target_ind

    # Keep only ``bow_size`` tokens on each side of the target, if set.
    if self.bow_size > 0:
        start_pos = max(target_pos - self.bow_size, 0)
        end_pos = min(target_pos + self.bow_size + 1,
                      len(context_text_tokens))
        context_text_tokens = context_text_tokens[start_pos:end_pos]
        target_pos = target_pos - start_pos

    stopwords = self.stopwords

    def known_indices(tokens):
        # Indices of the tokens that are neither stopwords nor unknown.
        return [
            w2ind[word] for word in tokens
            if word not in stopwords and word in w2ind
        ]

    # A slice starting past the end is simply empty.
    all_words_inds = (known_indices(context_text_tokens[:target_pos]) +
                      known_indices(context_text_tokens[target_pos + 1:]))

    total_weights = 0.0
    for word_ind in all_words_inds:
        w = self.i2w[word_ind]
        if self.args.tfidf:
            wcount = self.w2counts[w]
            log_idf = math.log(float(self.sum_word_counts) / wcount)
            log_idf -= self.args.tfidf_offset
            if log_idf <= self.args.tfidf_threshold:
                log_idf = 0.0
            weight = log_idf
        else:
            weight = 1

        if weight != 0:
            found_word = True
            if has_embeddings:
                if w in self.embeddings:
                    wordvec = self.embeddings.represent(w).transpose()
                    text_matrix = text_matrix + (wordvec * weight)
                else:
                    # Words without an embedding are excluded from the
                    # normalising total as well.
                    weight = 0.0
            else:
                text_matrix[word_ind, 0] += weight
        total_weights += weight

    # embeddings representations are always normalized
    if has_embeddings:
        if total_weights != 0:
            text_matrix /= total_weights
        norm = np.sqrt(np.sum(text_matrix * text_matrix))
        if norm != 0:
            text_matrix /= norm

    return text_matrix, found_word
def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter,
                        min_avg_cluster_size, subvec_filename,
                        cluster_filename):
    '''
    kmeans clustering of subvecs given in an input file
    :param w2i: word2index
    :param cluster_prunning: max size of a cluster centroid
    :param K: number of clusters
    :param ninit: number of repeating tries
    :param maxiter: number of clustering iterations
    :param min_avg_cluster_size: min size of clusters (on average)
    :param subvec_filename: input filename
    :param cluster_filename: output filename; when it already exists the
        function prints a notice and returns without touching it
    :returns: None
    '''

    if os.path.exists(cluster_filename):
        print("NOTICE: cluster file %s already exists. skipping." % cluster_filename)
        return

    # The centroids are written back as words, which needs the inverse of
    # the word -> index mapping that was passed in.
    i2w = {index: word for word, index in w2i.items()}

    # ``with`` guarantees both files are closed even if reading or
    # clustering raises.
    with open(subvec_filename, 'r') as subvec_file, \
            open(cluster_filename, 'w') as cluster_file:
        # subvec is on every second line
        subvec_num = sum(1 for _ in subvec_file) // 2
        subvec_file.seek(0)

        # Never ask for more clusters than the data supports, nor fewer
        # than one.
        minK = max(1, min(int(subvec_num // min_avg_cluster_size), K))

        print("Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename, minK))

        target = subvec_filename[subvec_filename.rfind('/') + 1:]

        subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32)

        line = 0
        try:
            # read_context signals the end of the input with EOFError
            while True:
                _, subvec = read_context(subvec_file)
                normalize_subvec(subvec)
                for word, weight in subvec:
                    if weight != 0:
                        subs_matrix[line, w2i[word]] = weight
                line += 1
                if line % 10000 == 0:
                    sys.stderr.write("Read %d subvecs\n" % (line))
        except EOFError:
            sys.stderr.write("Finished loading %d context lines\n" % line)

        subs_matrix = subs_matrix.tocsr()

        best_centroids = None
        best_inertia = None

        for _ in range(ninit):
            # One k-means step at a time so the centroids can be
            # re-normalized between iterations.
            kmeans = KMeans(init='k-means++', n_clusters=minK, n_init=1,
                            max_iter=1)
            kmeans.fit(subs_matrix)
            centroids = kmeans.cluster_centers_
            normalize_centroids(centroids)
            for _ in range(1, maxiter):
                kmeans = KMeans(init=centroids, n_clusters=minK, n_init=1,
                                max_iter=1)
                kmeans.fit(subs_matrix)
                centroids = kmeans.cluster_centers_
                normalize_centroids(centroids)
            inertia = kmeans.inertia_
            if best_centroids is None or inertia < best_inertia:
                best_inertia = inertia
                best_centroids = centroids

        for j in range(len(best_centroids)):
            cluster_vec = [(i2w[i], weight)
                           for (i, weight) in enumerate(best_centroids[j, :])
                           if weight != 0]
            # keep only the heaviest words, then re-normalize to unit length
            cluster_vec = sorted(cluster_vec, key=itemgetter(1),
                                 reverse=True)[:cluster_prunning]
            norm = sum([weight**2 for word, weight in cluster_vec])**0.5
            cluster_vec = [(word, weight / norm) for word, weight in cluster_vec]
            # recomputed only to report it in the header line
            norm = sum([weight**2 for word, weight in cluster_vec])**0.5
            cluster_file.write(target + "\t" + str(j) + "\t0\t" + target +
                               "\tCLUSTER\t norm verified = " +
                               '{0:1.8f}'.format(norm) +
                               "\tpruning factor = " + str(cluster_prunning) +
                               "\n")
            for (word, weight) in cluster_vec:
                cluster_file.write(
                    ' '.join([word, '{0:1.8f}'.format(weight)]) + '\t')
            cluster_file.write('\n')
# Sum the three channels; a pixel is treated as painted when the sum is
# non-zero.
# NOTE(review): if original_img is uint8 this sum can wrap around (e.g.
# 128 + 128 + 0 == 0) and hide a painted pixel - confirm the dtype.
img = original_img[:, :, 0] + original_img[:, :, 1] + original_img[:, :, 2]


# Defines a translation from 2 coordinates to a single number
def to_index(y, x):
    return y * img.shape[1] + x


# Defines a reversed translation from index to 2 coordinates
def to_coordinates(index):
    # Floor division keeps the row an integer on Python 3 as well, so the
    # result can be used directly as an array index.
    return index // img.shape[1], index % img.shape[1]


# A sparse adjacency matrix.
# Two pixels are adjacent in the graph if both are painted.
adjacency = dok_matrix((img.shape[0] * img.shape[1],
                        img.shape[0] * img.shape[1]), dtype=bool)

# The following lines fill the adjacency matrix: every painted interior
# pixel is linked to each painted pixel in its 3x3 neighbourhood
# (itself included).
directions = list(itertools.product([0, 1, -1], [0, 1, -1]))
for i in range(1, img.shape[0] - 1):
    for j in range(1, img.shape[1] - 1):
        if not img[i, j]:
            continue

        for y_diff, x_diff in directions:
            if img[i + y_diff, j + x_diff]:
                adjacency[to_index(i, j),
                          to_index(i + y_diff, j + x_diff)] = True

# We chose two arbitrary points, which we know are connected
source = to_index(14, 47)
def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number,
                 exclude_ref, weights_factor):
    '''
    Performs a weighted average of the rows of self.subs_matrix, weighted by
    self.sim_scores raised to weights_factor, optionally together with a
    reference subvec that takes part with weight 1.
    :param ref_subvec: reference subvec as a sparse matrix (its ``.data`` is
        scaled and it is transposed before being added), or None.
        The caller's matrix is not modified.
    :param top: number of highest-scoring entries to average, counting the
        reference subvec when it is included; 0 disables this limit
    :param top_percent: fraction of the entries to average, applied like
        ``top``; the larger of the two limits is used
    :param top_inferences_number: passed on to __vec_to_sorted_list
    :param exclude_ref: when False the reference subvec is included
    :param weights_factor: exponent applied to the similarity scores
        (0.0 gives uniform weights when no top limit is set)
    :returns: parvec, number of contexts averaged; (None, 0) when there are
        no contexts
    '''
    if len(self.contexts) == 0:
        return None, 0

    # Kept as an equality test so that any value other than False/0
    # (including None) still means "exclude", exactly as before.
    include_ref = (exclude_ref == False)  # noqa: E712
    ref_weight = 1 if include_ref else 0

    if top > len(self.contexts) + ref_weight:
        top = len(self.contexts) + ref_weight

    if top > 0 or top_percent > 0:
        all_weights = self.sim_scores.todok()
        final_top = top - ref_weight  # -1 to leave 1 for the ref_subvec
        num_top_percent = int(
            math.ceil(top_percent *
                      (len(self.contexts) + ref_weight))) - ref_weight
        final_top = max(final_top, num_top_percent)
        # items() exists on dok matrices under both Python 2 and 3,
        # unlike the Python-2-only iteritems().
        cw_sorted = heapq.nlargest(final_top, all_weights.items(),
                                   key=lambda x: x[1])
        top_contexts_weights = dok_matrix((len(self.contexts), 1),
                                          dtype=np.float32)
        for (k, j), weight in cw_sorted:
            top_contexts_weights[k, j] = weight**weights_factor
        top_contexts_weights = top_contexts_weights.tocsr()
        contexts_num = len(cw_sorted)
    else:
        contexts_num = len(self.contexts)
        if weights_factor == 0.0:
            top_contexts_weights = dok_matrix(
                [[1.0] * contexts_num]).tocsr().transpose()
        else:
            top_contexts_weights = self.sim_scores.copy()
            top_contexts_weights.data **= weights_factor

    # weight +1 reserved for ref_subvec
    sum_weights = top_contexts_weights.sum() + ref_weight

    top_contexts_weights.data /= sum_weights
    # NOT SUPPORTED IN SCIPY 0.7
    weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights)
    avg_subvec = weighted_subs_matrix.sum(axis=0)

    # ``is not None``: comparing a matrix with ``!= None`` goes through the
    # matrix's own element-wise comparison machinery.
    if include_ref and ref_subvec is not None:
        # Scale a copy; scaling ref_subvec.data in place would shrink the
        # caller's matrix a little more on every call.
        scaled_ref = ref_subvec.copy()
        scaled_ref.data *= 1.0 / sum_weights
        avg_subvec = avg_subvec + ref_weight * scaled_ref.transpose()

    result_vec = self.__vec_to_sorted_list(avg_subvec, top_inferences_number)
    return result_vec, contexts_num
def __init__(self, pset=None):
    """Set up the 1x1 sparse byte matrix and, if given, remember ``pset``.

    :param pset: optional value to store as ``self.pset``; nothing is
        assigned when it is ``None``.
    """
    initial_shape = (1, 1)
    self.__S = dok.dok_matrix(initial_shape, dtype=np.byte)
    if pset is None:
        return
    self.pset = pset