Example #1
0
def test_merge_matrix_col_wise():
    """Column-wise merge should append 'b' as extra columns of 'a',
    both into a fresh matrix (create_new=True) and in place
    (create_new=False)."""
    left = Matrix.from_values(
        [0, 0, 1, 2, 1],
        [0, 1, 2, 1, 3],
        [1, 2, 3, 4, 5],
    )
    right = Matrix.from_values([0, 2], [0, 1], [6, 7])
    expected = Matrix.from_values(
        [0, 0, 1, 2, 1, 0, 2],
        [0, 1, 2, 1, 3, 4, 5],
        [1, 2, 3, 4, 5, 6, 7],
    )

    # with create_new=True the merge must not touch 'left'
    merged = merge_matrix(left, right, row_wise=False, create_new=True)
    assert merged is not left
    assert merged.isequal(expected)

    # with create_new=False the merge happens in place on 'left'
    merged = merge_matrix(left, right, row_wise=False, create_new=False)
    assert merged is left
    assert merged.isequal(expected)
Example #2
0
def calc(data_dir, country_name):
    """Count triangles in the person-knows-person graph restricted to people
    located in 'country_name', and print the count.

    A person counts as "in the country" when they are located in a place
    whose isPartOf edge points to the country (e.g. a city).

    :param data_dir: root directory of the input CSV data
    :param country_name: name of the country used to filter people
    """
    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)

    persons = loader.load_empty_vertex('person')
    places = loader.load_vertex('place', is_dynamic=False, column_names=['name', 'type'])

    # print("Vertices persons and places\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_locatedin_place = loader.load_edge(persons, 'isLocatedIn', places, is_dynamic=True)
    place_ispartof_place = loader.load_edge(places, 'isPartOf', places, is_dynamic=False)
    # print("Loaded locatedIn and isPartOf edges\t%s" % logger.get_total_time(), file=stderr)

    # get country index
    # places.data rows are [name, type] (see column_names above), so this
    # looks up the row index of the country vertex
    country_index = places.data.index([country_name, 'country'])
    country_vector = Vector.from_values([country_index], [True], size=place_ispartof_place.ncols)

    # cities mapped to original ids
    # one step backwards over isPartOf: indices of places whose parent is the country
    country_vector_indices, _ = country_vector.vxm(place_ispartof_place.T).new().to_values()
    # diagonal selection matrix over those place indices
    # NOTE(review): nrows=places.length but ncols=persons.length — unusual for a
    # place-by-place selection matrix; verify these dimensions are intended
    country_city_matrix = Matrix.from_values(country_vector_indices, country_vector_indices, repeat(1, len(country_vector_indices)), nrows=places.length, ncols=persons.length)
    # persons with at least one location inside the country survive the row reduce
    person_mask, _ = person_locatedin_place.mxm(country_city_matrix).new().reduce_rows().new().to_values()
    person_mask = set(person_mask)

    # print("Created person mask\t%s" % logger.get_total_time(), file=stderr)

    # load person-knows-person for people located in 'country'
    person_knows_person = loader.load_edge(persons, 'knows', persons, is_dynamic=True, lmask=person_mask, rmask=person_mask, undirected=True)

    logger.loading_finished()

    # calculate triangles
    # A*A counts 2-paths between every pair; keeping only entries where a direct
    # edge also exists (ewise_mult) yields per-edge triangle counts
    r = person_knows_person.mxm(person_knows_person).new()
    r << r.ewise_mult(person_knows_person)
    # each undirected triangle is counted 6 times (3 vertices x 2 directions)
    triangle_count = r.reduce_rows().new().reduce().new().value // 6

    logger.calculation_finished()
    # print("Triangles calculated. All done\t%s" % logger.get_total_time(), file=stderr)

    print(triangle_count)
Example #3
0
def merge_matrix(a: Matrix, b: Matrix, *, create_new=False, row_wise=True):
    """
    Merges matrix 'b' into matrix 'a' and returns the combined matrix.
    If 'create_new' is false, 'a' will be overwritten,
    otherwise a new matrix will be created.

    :param a: base matrix; receives the merged content unless create_new=True
    :param b: matrix appended to 'a'
    :param create_new: when True, work on a duplicate and leave 'a' untouched
    :param row_wise: if True 'b' matrix will be added as rows to 'a', otherwise as columns.
    :return: the merged matrix ('a' itself when create_new is False)
    :raises ValueError: when the non-merged dimension of 'a' and 'b' differ
    """

    result = a.dup() if create_new else a

    if row_wise:
        if a.ncols != b.ncols:
            raise ValueError(
                f"Row-wise merge is not possible as a.ncols != b.ncols. "
                f"{a.ncols} != {b.ncols}")

        # remember the original height before resize extends 'result'
        a_nrows = a.nrows

        result.resize(a.nrows + b.nrows, a.ncols)
        result[a_nrows:a_nrows + b.nrows, :] = b

    else:
        if a.nrows != b.nrows:
            # bugfix: this branch performs the column-wise merge; the message
            # previously claimed "Row-wise"
            raise ValueError(
                f"Column-wise merge is not possible as a.nrows != b.nrows. "
                f"{a.nrows} != {b.nrows}")

        # remember the original width before resize extends 'result'
        a_ncols = a.ncols

        result.resize(a.nrows, a.ncols + b.ncols)
        result[:, a_ncols:a_ncols + b.ncols] = b

    return result
Example #4
0
    def load_edge(self,
                  from_vertex_type: VertexType,
                  edge_name: str,
                  to_vertex_type: VertexType,
                  *,
                  is_dynamic: bool,
                  dtype=dtypes.INT32,
                  lmask=None,
                  rmask=None,
                  undirected=False,
                  from_id_header_override=None,
                  to_id_header_override=None):
        """
        Loads edges of type 'edge_name' between 'from_vertex_type' and 'to_vertex_type'. These parameters
        also define the csv file that will be loaded.

        When using lmask or rmask not all entries will be loaded, only the ones that have an index (and not id!)
        already present in the mask. This method assumes that the corresponding id should already be present
        in the mapping. This results in matrix dimensions that correspond to the number of matching elements,
        instead of the number of all elements. i.e. if only 1 entry matches the lmask out of a 1000, the matrix will
        only have 1 row instead of 1000.

        TODO: add parsing of properties of a relation.
        :param from_vertex_type: source vertex type of the edge
        :param edge_name: name of the relation (middle part of the csv filename)
        :param to_vertex_type: target vertex type of the edge
        :param is_dynamic: selects the 'dynamic' or 'static' data subdirectory
        :param dtype: value type of the resulting adjacency matrix
        :param lmask: optional set of source indexes to keep (None keeps all)
        :param rmask: optional set of target indexes to keep (None keeps all)
        :param undirected: if True the matrix is symmetrized (m + m.T)
        :param from_id_header_override: csv header of the source id column, defaults to '<name>.id'
        :param to_id_header_override: csv header of the target id column, defaults to '<name>.id'
        :return: adjacency matrix
        :raises LoadError: if the csv file for this relation doesn't exist
        """

        # concat full filename for input file
        filename = "%s_%s_%s%s" % (from_vertex_type.name, edge_name,
                                   to_vertex_type.name, self.filename_suffix)

        # concat file path
        subdir = 'dynamic' if is_dynamic else 'static'
        file_path = path.join(self.data_dir, subdir, filename)

        if not path.isfile(file_path):
            raise LoadError(
                "(%s)-[:%s]-(%s) connection doesn't exist." %
                (from_vertex_type.name, edge_name, to_vertex_type.name))

        with open(file_path) as csvfile:
            reader = csv.reader(csvfile,
                                delimiter=DEFAULT_DELIMITER,
                                quotechar=DEFAULT_QUOTE)

            # get id columns
            # todo: if attributes are needed, column_names should be a function parameter and
            # todo: these values should be inserted into that
            column_names = [
                from_id_header_override or f'{from_vertex_type.name}.id',
                to_id_header_override or f'{to_vertex_type.name}.id',
            ]

            header = next(reader)
            columns = self._parse_header(header, column_names)

            from_indexes = []
            to_indexes = []

            # (row counter from the original enumerate was unused — dropped)
            for row in reader:
                row_data = [row[col] for col in columns]
                id_from = int(row_data.pop(0))
                id_to = int(row_data.pop(0))

                from_index = None
                if lmask is not None:
                    # if a mask is present auto creation of mapping doesn't make sense, because the mask already
                    # assumes an index-id mapping
                    from_index = from_vertex_type.id2index(id_from,
                                                           auto_create=False)
                    if from_index is None or from_index not in lmask:
                        continue

                to_index = None
                if rmask is not None:
                    # if a mask is present auto creation of mapping doesn't make sense, because the mask already
                    # assumes an index-id mapping
                    to_index = to_vertex_type.id2index(id_to,
                                                       auto_create=False)
                    if to_index is None or to_index not in rmask:
                        continue

                # because of the auto_create=False above, an index may still be missing.
                # bugfix: compare against None explicitly — the previous
                # 'to_index or ...' treated a valid index 0 as missing and
                # performed a redundant second lookup with auto_create=True.
                if to_index is None:
                    to_index = to_vertex_type.id2index(id_to, auto_create=True)
                if from_index is None:
                    from_index = from_vertex_type.id2index(id_from,
                                                           auto_create=True)

                from_indexes.append(from_index)
                to_indexes.append(to_index)

                # if there's anything else to store
                if row_data:
                    # todo: save additional properties connected to the edge.
                    pass

        m = Matrix.from_values(
            from_indexes,
            to_indexes,
            repeat(1, len(from_indexes)),  # 1 for all value
            nrows=from_vertex_type.length,
            ncols=to_vertex_type.length,
            dtype=dtype,
            name="%s_%s_%s" %
            (from_vertex_type.name, edge_name, to_vertex_type.name))

        if undirected:
            m << m.ewise_add(m.T)

        return m
Example #5
0
def calc(data_dir, city1_id, city2_id):
    """For each connected pair of people (p1 in city1, p2 in city2), print the
    pair ids and their shortest-path weight in the knows graph.

    Edge weights are 1 / (number of reply interactions between the two
    people), so frequently-interacting pairs are "closer". Shortest paths
    are computed with a batched Bellman-Ford iteration on a min-plus
    semiring, starting from every person in city1 at once.

    :param data_dir: root directory of the input CSV data
    :param city1_id: id of the source city (cast to int)
    :param city2_id: id of the target city (cast to int)
    """
    city1_id = int(city1_id)
    city2_id = int(city2_id)

    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)

    persons = loader.load_empty_vertex('person')
    places = loader.load_empty_vertex('place')
    comments = loader.load_empty_vertex('comment')
    posts = loader.load_empty_vertex('post')

    city1_index = places.id2index(city1_id)
    city2_index = places.id2index(city2_id)

    # print("Vertices loaded\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_knows_person = loader.load_edge(
        persons,
        'knows',
        persons,
        is_dynamic=True,
        undirected=True,
        from_id_header_override='Person1.id',
        to_id_header_override='Person2.id')
    # rmask restricts loading to persons located in one of the two cities
    person_locatedin_city = loader.load_edge(persons,
                                             'isLocatedIn',
                                             places,
                                             is_dynamic=True,
                                             rmask={city1_index, city2_index})

    # person indices living in each city (nonzero entries of the city column)
    persons_in_city1, _ = person_locatedin_city[:,
                                                city1_index].new().to_values()
    persons_in_city2, _ = person_locatedin_city[:,
                                                city2_index].new().to_values()

    # create a matrix containing message-hascreator-person relation, which contains both posts and comments
    # fixme: does it worth at all to create these message-hascreator and replyof-message matrices...?
    # fixme: This could be solved by multiplying them separately.
    comment_hascreator_person = loader.load_edge(comments,
                                                 'hasCreator',
                                                 persons,
                                                 is_dynamic=True)
    post_hascreator_person = loader.load_edge(posts,
                                              'hasCreator',
                                              persons,
                                              is_dynamic=True)

    # make sure to have the same person dimension length
    person_knows_person.resize(persons.length, persons.length)
    comment_hascreator_person.resize(comment_hascreator_person.nrows,
                                     persons.length)
    post_hascreator_person.resize(post_hascreator_person.nrows, persons.length)

    # stack comment-hasCreator on top of post-hasCreator:
    # rows [0, comments.length) are comments, the rest are posts
    message_hascreator_person = comment_hascreator_person.dup()
    message_hascreator_person.resize(comments.length + posts.length,
                                     persons.length)
    message_hascreator_person[comments.length:comments.length +
                              posts.length, :] = post_hascreator_person

    # create a matrix containing comment-replyOf-message relation, which contains both posts and comments as parents
    comment_replyof_comment = loader.load_edge(
        comments,
        'replyOf',
        comments,
        is_dynamic=True,
        to_id_header_override='ParentComment.id')
    comment_replyof_post = loader.load_edge(
        comments,
        'replyOf',
        posts,
        is_dynamic=True,
        to_id_header_override='ParentPost.id')
    # NOTE(review): "messge" is a typo for "message" — kept as-is here
    comment_replyof_messge = comment_replyof_comment.dup()
    comment_replyof_messge.resize(comments.length,
                                  comments.length + posts.length)
    comment_replyof_messge[:, comments.length:comments.length +
                           posts.length] = comment_replyof_post

    logger.loading_finished()

    # calculate weight matrix
    # person -> messages that are replies to that person's messages
    person_replyof_message = message_hascreator_person.T.mxm(
        comment_replyof_messge.T).new()
    # person x person interaction counts, restricted (masked) to pairs that
    # actually know each other
    person_weight_person = person_replyof_message.mxm(
        comment_hascreator_person).new(
            dtype=dtypes.FP32, mask=StructuralMask(person_knows_person))

    # make sure we have a square matrix. It can be different because not all person created replies or comments.
    person_weight_person.resize(persons.length, persons.length)

    # make weight matrix bidirectional
    person_weight_person << person_weight_person.ewise_add(
        person_weight_person.T)

    # invert counts: more interactions -> smaller (shorter) edge weight
    recipr = UnaryOp.register_anonymous(lambda x: 1 / x)
    person_weight_person << person_weight_person.apply(recipr)

    # calculate shortest paths on person_weight_person (batched Bellman-Ford below)
    # set diagonal to 0 so staying at a vertex is free
    for i in range(person_weight_person.ncols):
        person_weight_person[i, i] << 0

    # Batched Bellman-Ford algorithm for finding shortest path
    # row k of path_matrix holds the tentative distances from the k-th
    # city1 person to everyone; sources start at distance 0
    len_city1 = len(persons_in_city1)
    path_matrix = Matrix.from_values(range(len_city1),
                                     persons_in_city1,
                                     repeat(0, len_city1),
                                     ncols=persons.length,
                                     dtype=person_weight_person.dtype)

    prev_path_matrix = path_matrix.dup()
    while True:
        # one relaxation step over all sources at once (min-plus semiring)
        path_matrix << path_matrix.mxm(person_weight_person,
                                       op=semiring.min_plus)

        if path_matrix.isequal(prev_path_matrix):
            # break if a fix point is reached
            break

        prev_path_matrix = path_matrix.dup()

    # extract only people in city 2
    results = path_matrix[:, list(persons_in_city2)].new()

    # map matrix coordinates back to original person ids;
    # p1/p2 index into persons_in_city1/persons_in_city2, w is the path weight
    result_tuples = [(persons.index2id(persons_in_city1[p1]),
                      persons.index2id(persons_in_city2[p2]), w)
                     for p1, p2, w in zip(*results.to_values())]

    # sort and print results
    # order: weight descending, then person1 id, then person2 id ascending
    # print("Result extracted, sorting...\t%s" % logger.get_total_time(), file=stderr)
    result_tuples = sorted(result_tuples, key=lambda x: (-x[2], x[0], x[1]))

    logger.calculation_finished()

    for pair in result_tuples:
        print(*pair)
Example #6
0
def calc(data_dir, person_id, tag_name):
    """Print up to 20 second-level friends of 'person_id' who are interested
    in 'tag_name', ranked by the number of mutual level-1 friends
    (descending), ties broken by person id (ascending).

    :param data_dir: root directory of the input CSV data
    :param person_id: id of the start person (cast to int)
    :param tag_name: name of the tag the level-2 friends must be interested in
    """
    person_id = int(person_id)

    # init timer
    logger = Logger()

    # load vertices
    loader = Loader(data_dir)

    persons = loader.load_vertex('person', is_dynamic=True)
    # one-hot vector selecting the start person
    person_vector = Vector.from_values([persons.id2index(person_id)], [True],
                                       size=persons.length)

    tags = loader.load_vertex('tag', is_dynamic=False, column_names=['name'])

    # tags.data rows are [name] (see column_names above)
    tag_index = tags.data.index([tag_name])
    tag_vector = Vector.from_values([tag_index], [True], size=tags.length)

    # print("Vertices loaded\t%s" % logger.get_total_time(), file=stderr)

    # load edges
    person_knows_person = loader.load_edge(persons,
                                           'knows',
                                           persons,
                                           is_dynamic=True,
                                           undirected=True)

    person_hasinterest_tag = loader.load_edge(persons,
                                              'hasInterest',
                                              tags,
                                              is_dynamic=True)

    # print("Edges loaded\t%s" % logger.get_total_time(), file=stderr)

    logger.loading_finished()

    # direct friends of given person
    friendsl1 = person_vector.vxm(person_knows_person).new()

    # get second level friends of given person, who are interested in given tag. (They should not be in friendsl1!)
    # the complemented mask (~) drops anyone already in friendsl1
    interested_persons = tag_vector.vxm(
        person_hasinterest_tag.T).new(mask=~StructuralMask(friendsl1))

    # manually remove the parameter person as he is interested in the given tag and is a friend of his friends
    del interested_persons[persons.id2index(person_id)]

    # level-2 friends: neighbours of level-1 friends, kept only where they
    # are also interested persons (structural mask)
    friendsl2 = friendsl1.vxm(person_knows_person).new(
        mask=StructuralMask(interested_persons))
    friendsl2_keys, _ = friendsl2.to_values()

    # calculate mutual friend count...
    # result_matrix start out as selection matrix for level2 friends
    result_matrix = Matrix.from_values(friendsl2_keys,
                                       friendsl2_keys,
                                       values=repeat(1, friendsl2.nvals),
                                       nrows=persons.length,
                                       ncols=persons.length)

    # get the corresponding friends for each level2 friend
    result_matrix << result_matrix.mxm(person_knows_person)

    # create a selection matrix for level1 friends
    friendsl1_keys, _ = friendsl1.to_values()
    friendsl1_matrix = Matrix.from_values(friendsl1_keys,
                                          friendsl1_keys,
                                          values=repeat(1, friendsl1.nvals),
                                          nrows=persons.length,
                                          ncols=persons.length)

    # filter for level1 friends (so we will have the mutual friends)
    result_matrix << result_matrix.mxm(friendsl1_matrix)

    # reduce rows to get count of mutual friends for each person and
    # create (person_index, count) tuples
    result_values = zip(*result_matrix.reduce_rows().new().to_values())

    # create final (person_id, count) tuples and sort them by count ASC, id DESC
    result = sorted(map(lambda x: (persons.index2id(x[0]), x[1]),
                        result_values),
                    key=lambda x: (-x[1], x[0]))

    logger.calculation_finished()

    # print top results
    for person_id, mutual_friend_count in islice(result, 20):
        print(person_id, mutual_friend_count)