def test_merge_matrix_col_wise():
    a = Matrix.from_values(
        [0, 0, 1, 2, 1],
        [0, 1, 2, 1, 3],
        [1, 2, 3, 4, 5],
    )
    b = Matrix.from_values([0, 2], [0, 1], [6, 7])
    expected_result = Matrix.from_values(
        [0, 0, 1, 2, 1, 0, 2],
        [0, 1, 2, 1, 3, 4, 5],
        [1, 2, 3, 4, 5, 6, 7],
    )

    result = merge_matrix(a, b, row_wise=False, create_new=True)
    assert id(result) != id(a)  # 'result' and 'a' should be different objects
    assert result.isequal(expected_result)

    result = merge_matrix(a, b, row_wise=False, create_new=False)
    assert id(result) == id(a)  # 'result' and 'a' should be the same object
    assert result.isequal(expected_result)
def calc(data_dir, country_name):
    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)
    persons = loader.load_empty_vertex('person')
    places = loader.load_vertex('place', is_dynamic=False,
                                column_names=['name', 'type'])
    # print("Vertices persons and places\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_locatedin_place = loader.load_edge(persons, 'isLocatedIn', places,
                                              is_dynamic=True)
    place_ispartof_place = loader.load_edge(places, 'isPartOf', places,
                                            is_dynamic=False)
    # print("Loaded locatedIn and isPartOf edges\t%s" % logger.get_total_time(), file=stderr)

    # get country index
    country_index = places.data.index([country_name, 'country'])
    country_vector = Vector.from_values([country_index], [True],
                                        size=place_ispartof_place.ncols)

    # indexes of cities that are part of the given country
    country_vector_indices, _ = country_vector.vxm(
        place_ispartof_place.T).new().to_values()
    # diagonal selection matrix over places: keeps only the country's cities
    country_city_matrix = Matrix.from_values(
        country_vector_indices, country_vector_indices,
        repeat(1, len(country_vector_indices)),
        nrows=places.length, ncols=places.length)
    person_mask, _ = person_locatedin_place.mxm(
        country_city_matrix).new().reduce_rows().new().to_values()
    person_mask = set(person_mask)
    # print("Created person mask\t%s" % logger.get_total_time(), file=stderr)

    # load person-knows-person for people located in 'country'
    person_knows_person = loader.load_edge(persons, 'knows', persons,
                                           is_dynamic=True,
                                           lmask=person_mask,
                                           rmask=person_mask,
                                           undirected=True)
    logger.loading_finished()

    # calculate triangles: for an undirected adjacency matrix A, each triangle
    # contributes 6 to sum((A @ A) * A)
    r = person_knows_person.mxm(person_knows_person).new()
    r << r.ewise_mult(person_knows_person)
    triangle_count = r.reduce_rows().new().reduce().new().value // 6
    logger.calculation_finished()
    # print("Triangles calculated. All done\t%s" % logger.get_total_time(), file=stderr)

    print(triangle_count)
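# A minimal, hypothetical sketch of the triangle-counting identity used in
# calc() above, assuming the same grblas-style Matrix API as the rest of this
# module. For an undirected adjacency matrix A, each triangle is counted 6
# times in sum((A @ A) * A), hence the division by 6.
def _triangle_count_demo():
    from grblas import Matrix

    # undirected 3-cycle: vertices 0-1-2 form exactly one triangle
    rows = [0, 1, 0, 2, 1, 2]
    cols = [1, 0, 2, 0, 2, 1]
    a = Matrix.from_values(rows, cols, [1] * 6, nrows=3, ncols=3)

    r = a.mxm(a).new()    # r[i, j] = number of length-2 paths from i to j
    r << r.ewise_mult(a)  # keep only 2-paths whose endpoints are adjacent
    return r.reduce_rows().new().reduce().new().value // 6  # -> 1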
def merge_matrix(a: Matrix, b: Matrix, *, create_new=False, row_wise=True):
    """
    Creates a matrix by merging matrices 'a' and 'b'.

    If 'create_new' is False, 'a' will be overwritten, otherwise a new
    matrix will be created.

    :param a: base matrix
    :param b: matrix to append to 'a'
    :param create_new: if True, return a new matrix instead of modifying 'a'
    :param row_wise: if True, 'b' will be appended to 'a' as rows,
        otherwise as columns.
    :return: the merged matrix
    """
    result = a.dup() if create_new else a
    if row_wise:
        if a.ncols != b.ncols:
            raise ValueError(
                f"Row-wise merge is not possible as a.ncols != b.ncols. "
                f"{a.ncols} != {b.ncols}")
        a_nrows = a.nrows
        result.resize(a.nrows + b.nrows, a.ncols)
        result[a_nrows:a_nrows + b.nrows, :] = b
    else:
        if a.nrows != b.nrows:
            raise ValueError(
                f"Column-wise merge is not possible as a.nrows != b.nrows. "
                f"{a.nrows} != {b.nrows}")
        a_ncols = a.ncols
        result.resize(a.nrows, a.ncols + b.ncols)
        result[:, a_ncols:a_ncols + b.ncols] = b
    return result
def load_edge(self, from_vertex_type: VertexType, edge_name: str,
              to_vertex_type: VertexType, *,
              is_dynamic: bool, dtype=dtypes.INT32,
              lmask=None, rmask=None, undirected=False,
              from_id_header_override=None, to_id_header_override=None):
    """
    Loads edges of type 'edge_name' between 'from_vertex_type' and
    'to_vertex_type'. These parameters also define the csv file that
    will be loaded.

    When using lmask or rmask, not all entries will be loaded, only the
    ones whose index (and not id!) is already present in the mask. This
    method assumes that the corresponding id is already present in the
    mapping. This results in matrix dimensions that correspond to the
    number of matching elements instead of the number of all elements,
    i.e. if only 1 entry out of 1000 matches the lmask, the matrix will
    only have 1 row instead of 1000.

    TODO: add parsing of properties of a relation.

    :param from_vertex_type: vertex type on the 'from' side of the edge
    :param edge_name: name of the edge type
    :param to_vertex_type: vertex type on the 'to' side of the edge
    :param is_dynamic: whether the csv is in the 'dynamic' or 'static' subdir
    :return: adjacency matrix
    """
    # concat full filename for input file
    filename = "%s_%s_%s%s" % (from_vertex_type.name, edge_name,
                               to_vertex_type.name, self.filename_suffix)

    # concat file path
    subdir = 'dynamic' if is_dynamic else 'static'
    file_path = path.join(self.data_dir, subdir, filename)

    if not path.isfile(file_path):
        raise LoadError("(%s)-[:%s]-(%s) connection doesn't exist."
                        % (from_vertex_type.name, edge_name,
                           to_vertex_type.name))

    with open(file_path) as csvfile:
        reader = csv.reader(csvfile, delimiter=DEFAULT_DELIMITER,
                            quotechar=DEFAULT_QUOTE)

        # get id columns
        # todo: if attributes are needed, column_names should be a function
        # todo: parameter and these values should be inserted into it
        column_names = [
            from_id_header_override or f'{from_vertex_type.name}.id',
            to_id_header_override or f'{to_vertex_type.name}.id',
        ]

        header = next(reader)
        columns = self._parse_header(header, column_names)

        from_indexes = []
        to_indexes = []
        for row in reader:
            row_data = [row[c] for c in columns]
            id_from = int(row_data.pop(0))
            id_to = int(row_data.pop(0))

            from_index = None
            if lmask is not None:
                # if a mask is present, auto creation of the mapping doesn't
                # make sense, because the mask already assumes an index-id
                # mapping
                from_index = from_vertex_type.id2index(id_from,
                                                       auto_create=False)
                if from_index is None or from_index not in lmask:
                    continue

            to_index = None
            if rmask is not None:
                # if a mask is present, auto creation of the mapping doesn't
                # make sense, because the mask already assumes an index-id
                # mapping
                to_index = to_vertex_type.id2index(id_to, auto_create=False)
                if to_index is None or to_index not in rmask:
                    continue

            # because of auto_create=False, at this point to_index and/or
            # from_index may still be None. Make sure they are valid.
            # (Explicit None checks: index 0 is falsy but valid.)
            if to_index is None:
                to_index = to_vertex_type.id2index(id_to, auto_create=True)
            if from_index is None:
                from_index = from_vertex_type.id2index(id_from,
                                                       auto_create=True)

            from_indexes.append(from_index)
            to_indexes.append(to_index)

            # if there's anything else to store
            if row_data:
                # todo: save additional properties connected to the edge.
                pass

    m = Matrix.from_values(
        from_indexes, to_indexes,
        repeat(1, len(from_indexes)),  # 1 for all values
        nrows=from_vertex_type.length, ncols=to_vertex_type.length,
        dtype=dtype,
        name="%s_%s_%s" % (from_vertex_type.name, edge_name,
                           to_vertex_type.name))
    if undirected:
        m << m.ewise_add(m.T)
    return m
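# A small, self-contained sketch of the lmask/rmask semantics documented in
# load_edge() above. The id2index dict below is hypothetical and stands in for
# VertexType.id2index with auto_create=False: an edge survives only if its
# endpoint's *index* (not its raw id) is already present in the mask.
def _mask_filter_demo():
    id2index = {100: 0, 200: 1, 300: 2}  # hypothetical id -> index mapping
    edges = [(100, 200), (200, 300), (300, 100)]
    lmask = {0, 2}                       # keep edges whose source index is 0 or 2
    kept = [(id_from, id_to) for id_from, id_to in edges
            if id2index.get(id_from) in lmask]
    return kept                          # -> [(100, 200), (300, 100)]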
def calc(data_dir, city1_id, city2_id):
    city1_id = int(city1_id)
    city2_id = int(city2_id)

    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)
    persons = loader.load_empty_vertex('person')
    places = loader.load_empty_vertex('place')
    comments = loader.load_empty_vertex('comment')
    posts = loader.load_empty_vertex('post')

    city1_index = places.id2index(city1_id)
    city2_index = places.id2index(city2_id)
    # print("Vertices loaded\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_knows_person = loader.load_edge(
        persons, 'knows', persons, is_dynamic=True, undirected=True,
        from_id_header_override='Person1.id',
        to_id_header_override='Person2.id')
    person_locatedin_city = loader.load_edge(
        persons, 'isLocatedIn', places, is_dynamic=True,
        rmask={city1_index, city2_index})
    persons_in_city1, _ = person_locatedin_city[:, city1_index].new().to_values()
    persons_in_city2, _ = person_locatedin_city[:, city2_index].new().to_values()

    # create a matrix containing the message-hasCreator-person relation,
    # which contains both posts and comments
    # fixme: is it worth it at all to create these message-hasCreator and
    # fixme: replyOf-message matrices...? This could be solved by
    # fixme: multiplying them separately.
    comment_hascreator_person = loader.load_edge(comments, 'hasCreator',
                                                 persons, is_dynamic=True)
    post_hascreator_person = loader.load_edge(posts, 'hasCreator', persons,
                                              is_dynamic=True)

    # make sure to have the same person dimension length
    person_knows_person.resize(persons.length, persons.length)
    comment_hascreator_person.resize(comment_hascreator_person.nrows,
                                     persons.length)
    post_hascreator_person.resize(post_hascreator_person.nrows,
                                  persons.length)

    message_hascreator_person = comment_hascreator_person.dup()
    message_hascreator_person.resize(comments.length + posts.length,
                                     persons.length)
    message_hascreator_person[comments.length:comments.length + posts.length, :] = \
        post_hascreator_person

    # create a matrix containing the comment-replyOf-message relation, which
    # contains both posts and comments as parents
    comment_replyof_comment = loader.load_edge(
        comments, 'replyOf', comments, is_dynamic=True,
        to_id_header_override='ParentComment.id')
    comment_replyof_post = loader.load_edge(
        comments, 'replyOf', posts, is_dynamic=True,
        to_id_header_override='ParentPost.id')

    comment_replyof_message = comment_replyof_comment.dup()
    comment_replyof_message.resize(comments.length,
                                   comments.length + posts.length)
    comment_replyof_message[:, comments.length:comments.length + posts.length] = \
        comment_replyof_post
    logger.loading_finished()

    # calculate weight matrix
    person_replyof_message = message_hascreator_person.T.mxm(
        comment_replyof_message.T).new()
    person_weight_person = person_replyof_message.mxm(
        comment_hascreator_person).new(
            dtype=dtypes.FP32, mask=StructuralMask(person_knows_person))

    # make sure we have a square matrix. It can differ, because not every
    # person has created replies or comments.
    person_weight_person.resize(persons.length, persons.length)

    # make weight matrix bidirectional
    person_weight_person << person_weight_person.ewise_add(
        person_weight_person.T)
    recipr = UnaryOp.register_anonymous(lambda x: 1 / x)
    person_weight_person << person_weight_person.apply(recipr)

    # calculate shortest paths on person_weight_person using a batched
    # Bellman-Ford algorithm (see below)
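# A tiny sketch of the weight conversion above: entries start as interaction
# (reply) counts, and applying a reciprocal UnaryOp turns "many interactions"
# into "short edges" for the shortest-path step. The UnaryOp import path is an
# assumption and may differ between grblas versions; the values are toy data.
def _reciprocal_weight_demo():
    from grblas import Matrix, dtypes
    from grblas.operator import UnaryOp  # assumed import path

    w = Matrix.from_values([0, 1], [1, 0], [4.0, 2.0], dtype=dtypes.FP32)
    recipr = UnaryOp.register_anonymous(lambda x: 1 / x)
    w << w.apply(recipr)
    return w.to_values()  # values become (0.25, 0.5)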
    # set diagonal to 0
    for i in range(person_weight_person.ncols):
        person_weight_person[i, i] << 0

    # batched Bellman-Ford algorithm for finding shortest paths
    len_city1 = len(persons_in_city1)
    path_matrix = Matrix.from_values(range(len_city1), persons_in_city1,
                                     repeat(0, len_city1),
                                     ncols=persons.length,
                                     dtype=person_weight_person.dtype)
    prev_path_matrix = path_matrix.dup()
    while True:
        path_matrix << path_matrix.mxm(person_weight_person,
                                       op=semiring.min_plus)
        if path_matrix.isequal(prev_path_matrix):
            # break if a fixed point is reached
            break
        prev_path_matrix = path_matrix.dup()

    # extract only people in city 2
    results = path_matrix[:, list(persons_in_city2)].new()
    result_tuples = [(persons.index2id(persons_in_city1[p1]),
                      persons.index2id(persons_in_city2[p2]),
                      w)
                     for p1, p2, w in zip(*results.to_values())]

    # sort and print results
    # print("Result extracted, sorting...\t%s" % logger.get_total_time(), file=stderr)
    result_tuples = sorted(result_tuples, key=lambda x: (-x[2], x[0], x[1]))
    logger.calculation_finished()

    for pair in result_tuples:
        print(*pair)
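# A minimal sketch of the batched Bellman-Ford relaxation above on a toy
# digraph, assuming the same grblas min_plus semiring API. The 0-weight
# self-loops play the role of the zeroed diagonal: they let already-found
# distances survive each relaxation step.
def _bellman_ford_demo():
    from itertools import repeat
    from grblas import Matrix, semiring

    # edges 0->1 (1.0), 1->2 (2.0), 0->2 (5.0), plus 0-weight self-loops
    rows = [0, 1, 0] + [0, 1, 2]
    cols = [1, 2, 2] + [0, 1, 2]
    vals = [1.0, 2.0, 5.0] + list(repeat(0.0, 3))
    w = Matrix.from_values(rows, cols, vals, nrows=3, ncols=3)

    dist = Matrix.from_values([0], [0], [0.0], nrows=1, ncols=3)  # source: 0
    prev = dist.dup()
    while True:
        dist << dist.mxm(w, op=semiring.min_plus)  # relax all edges at once
        if dist.isequal(prev):
            break                                  # fixed point reached
        prev = dist.dup()
    return dist.to_values()  # distances 0.0, 1.0, 3.0 (0->1->2 beats 0->2)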
def calc(data_dir, person_id, tag_name):
    person_id = int(person_id)

    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)
    persons = loader.load_vertex('person', is_dynamic=True)
    person_vector = Vector.from_values([persons.id2index(person_id)], [True],
                                       size=persons.length)
    tags = loader.load_vertex('tag', is_dynamic=False, column_names=['name'])
    tag_index = tags.data.index([tag_name])
    tag_vector = Vector.from_values([tag_index], [True], size=tags.length)
    # print("Vertices loaded\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_knows_person = loader.load_edge(persons, 'knows', persons,
                                           is_dynamic=True, undirected=True)
    person_hasinterest_tag = loader.load_edge(persons, 'hasInterest', tags,
                                              is_dynamic=True)
    # print("Edges loaded\t%s" % logger.get_total_time(), file=stderr)
    logger.loading_finished()

    # direct friends of the given person
    friendsl1 = person_vector.vxm(person_knows_person).new()

    # get second level friends of the given person who are interested in the
    # given tag. (They must not be in friendsl1!)
    interested_persons = tag_vector.vxm(
        person_hasinterest_tag.T).new(mask=~StructuralMask(friendsl1))
    # manually remove the parameter person: they may be interested in the
    # given tag and are trivially a friend of their own friends
    del interested_persons[persons.id2index(person_id)]
    friendsl2 = friendsl1.vxm(person_knows_person).new(
        mask=StructuralMask(interested_persons))
    friendsl2_keys, _ = friendsl2.to_values()

    # calculate mutual friend count...
    # result_matrix starts out as a selection matrix for level2 friends
    result_matrix = Matrix.from_values(friendsl2_keys, friendsl2_keys,
                                       values=repeat(1, friendsl2.nvals),
                                       nrows=persons.length,
                                       ncols=persons.length)
    # get the corresponding friends for each level2 friend
    result_matrix << result_matrix.mxm(person_knows_person)

    # create a selection matrix for level1 friends
    friendsl1_keys, _ = friendsl1.to_values()
    friendsl1_matrix = Matrix.from_values(friendsl1_keys, friendsl1_keys,
                                          values=repeat(1, friendsl1.nvals),
                                          nrows=persons.length,
                                          ncols=persons.length)
    # filter for level1 friends (so we will have the mutual friends)
    result_matrix << result_matrix.mxm(friendsl1_matrix)

    # reduce rows to get the count of mutual friends for each person and
    # create (person_index, count) tuples
    result_values = zip(*result_matrix.reduce_rows().new().to_values())

    # create final (person_id, count) tuples and sort them by count DESC, id ASC
    result = sorted(map(lambda x: (persons.index2id(x[0]), x[1]),
                        result_values),
                    key=lambda x: (-x[1], x[0]))
    logger.calculation_finished()

    # print top results
    for person_id, mutual_friend_count in islice(result, 20):
        print(person_id, mutual_friend_count)
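# A short sketch of the diagonal "selection matrix" trick used above: ones on
# the diagonal at a set of indexes, so that multiplying keeps only those rows
# (left multiply) or columns (right multiply). The toy adjacency matrix is
# hypothetical; the API mirrors this module's grblas usage.
def _selection_matrix_demo():
    from itertools import repeat
    from grblas import Matrix

    adj = Matrix.from_values([0, 1, 2], [1, 2, 0], [1, 1, 1], nrows=3, ncols=3)
    keep = [0, 2]  # indexes to select
    sel = Matrix.from_values(keep, keep, repeat(1, len(keep)),
                             nrows=3, ncols=3)

    filtered = sel.mxm(adj).new()  # only rows 0 and 2 survive
    return filtered.to_values()    # -> rows (0, 2), cols (1, 0), values (1, 1)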