Exemple #1
0
def _read_last_iteration_time(time_path):
    """Return the time stored on the last usable line of *time_path*.

    The first line of the file is always discarded. Of the remaining lines,
    the last non-blank one is taken and the text after its final ":" is
    parsed as a float.

    Returns:
        The parsed time, or ``None`` when no non-blank line follows the
        first line (i.e. the file holds no measurement).

    Raises:
        ValueError: if the selected text is not a valid float.
    """
    last_line = None
    with open(time_path, "r") as f:
        f.readline()  # the first line is never used as a measurement
        for line in f:
            if line.strip():
                last_line = line
    if last_line is None:
        return None
    return float(last_line.split(":")[-1])


def _load_sample(proto_path, time_path):
    """Build one training sample from an HLO dump and its timing file.

    Returns:
        Whatever ``gen_data`` returns for the parsed module, with an
        ``"execution_time"`` entry added, or ``None`` when the timing file
        contains no measurement.
    """
    with open(proto_path, "rb") as f:
        hlo_proto = hlo_pb2.HloProto()
        hlo_proto.ParseFromString(f.read())
    res = gen_data(hlo_proto.hlo_module)

    execution_time = _read_last_iteration_time(time_path)
    if execution_time is None:
        return None
    res["execution_time"] = execution_time
    return res


def get_all_data():
    """Collect training samples from every available fusion configuration.

    Two sweeps of (op fusion level, tensor fusion threshold) pairs are
    scanned under a fixed directory layout. A configuration contributes a
    sample only when both its HLO module dump and its timing file exist;
    configurations with missing files, or with a timing file that holds no
    measurement, are skipped.

    Returns:
        A list with one entry per usable configuration, each being the
        result of ``gen_data`` with ``"execution_time"`` set.
    """
    path_base = "/home/net/xiaodong/trax/trax/hlo_module/{}/{}_op_fusion_level_{}_tensor_fusion_threshold_{}/{}"

    module_name = "training.module_0061.after_all_reduce_combiner.hlo.pb"
    time_name = "per_iteration_time.txt"
    worker_num = 6
    model_name = "transformer"

    # (op fusion levels, tensor fusion thresholds) for each sweep, in the
    # order the samples are appended.
    sweeps = [
        ([0, 1, 2, 3], [0, 20, 30, 40, 60, 80, 90, 120, 240, 10000000]),
        ([888], range(150)),
    ]

    training_datas = []
    for op_levels, tensor_thresholds in sweeps:
        for op_level in op_levels:
            for tensor_threshold in tensor_thresholds:
                proto_path = path_base.format(worker_num, model_name,
                                              op_level, tensor_threshold,
                                              module_name)
                time_path = path_base.format(worker_num, model_name,
                                             op_level, tensor_threshold,
                                             time_name)
                if not (os.path.exists(proto_path)
                        and os.path.exists(time_path)):
                    continue

                res = _load_sample(proto_path, time_path)
                # Previously a one-line timing file raised NameError or
                # reused the time of the preceding configuration.
                if res is not None:
                    training_datas.append(res)

    return training_datas
Exemple #2
0
def test_empty_proto():
    """An empty HloProto is rejected with a ValueError about the entry computation."""
    empty_proto = hlo_pb2.HloProto()
    with test.Raises(ValueError) as raised:
        xla.BuildProgramGraphProto(empty_proto)

    message = str(raised.value)
    assert "Failed to locate entry computation" in message
Exemple #3
0
def launch():
    """Score the HLO module carried in a POST request body.

    The request body is parsed as a serialized ``HloProto``; ``gen_data``
    turns its module into feature arrays and a graph. The features are
    converted to float32 tensors, fed to the global ``model`` and the mean
    of the model output is returned as a string.

    Returns:
        ``str`` of the mean model output for POST requests. For any other
        request method the function falls through and returns ``None``.
    """
    if request.method == 'POST':
        data = request.get_data()
        hlo_proto = hlo_pb2.HloProto()
        hlo_proto.ParseFromString(data)
        res = gen_data(hlo_proto.hlo_module)

        # Order matters: the model receives the tensors positionally.
        feature_keys = (
            "instruction_feats",
            "computation_feats",
            "final_feats",
            "instruction_edge_feats",
            "call_computation_edge_feats",
            "in_computation_edge_feats",
            "to_final_edge_feats",
        )
        model_inputs = [
            tf.convert_to_tensor(res[key], dtype=tf.float32)
            for key in feature_keys
        ]

        # set_graph mutates the shared model, so it and the forward pass
        # must run under the lock. Using ``with`` guarantees the lock is
        # released even when either call raises.
        with my_lock:
            model.set_graph(res["graph"])
            ranklogit = model(model_inputs, training=False)
        ranklogit = tf.math.reduce_mean(ranklogit).numpy()

        return str(ranklogit)
Exemple #4
0
    def test_accuracy(self, hlo_model=None):
        """Print the time estimates for an HLO module, with and without the GNN.

        Args:
            hlo_model: Path to a file containing a serialized ``HloProto``.
                When ``None``, ``self.init_hlo_module`` is used instead.
        """
        if hlo_model is None:
            hlo_model = self.init_hlo_module
        else:
            with open(hlo_model, "rb") as f:
                hlo_proto = hlo_pb2.HloProto()
                hlo_proto.ParseFromString(f.read())
            hlo_model = hlo_proto.hlo_module

        estimated_time = self.estimate_time(hlo_model)
        print("estimated_time withgnn:", estimated_time)

        estimated_time, all_reduce_time = self.estimate_time_without_gnn(hlo_model)
        print("estimated_time without gnn:", estimated_time)
        print("all-reduce_time without gnn:", all_reduce_time)
Exemple #5
0
def launch():
    """Estimate the run time of the HLO module sent in a POST body.

    The body is parsed as a serialized ``HloProto`` and its module is handed
    to ``cost_model.estimate_time`` while ``my_lock`` is held. If the
    estimate raises, the error is printed and 0 is reported instead.

    Returns:
        The estimate as a string for POST requests, ``None`` otherwise.
    """
    if request.method != 'POST':
        return None

    print("get request")
    payload = request.get_data()
    proto = hlo_pb2.HloProto()
    proto.ParseFromString(payload)
    module = proto.hlo_module

    with my_lock:
        try:
            predicted = cost_model.estimate_time(module)
        except Exception as exc:
            # Best effort: report the failure and answer with 0.
            print(exc)
            predicted = 0

    print("send request")
    return str(predicted)
Exemple #6
0
def get_cost_model(profiler_path="training_hlo_execution_profile_data",
                   hlo_module_path="training.hlo.pb"):
    """Update the cached per-instruction timing tables from a profile run.

    The tables stored in ``cost_model.pkl`` are loaded (or started empty
    when the cache cannot be loaded), extended with every instruction found
    in the execution profile, and saved back.

    Args:
        profiler_path: File holding a serialized ``HloExecutionProfileData``.
        hlo_module_path: File holding the serialized ``HloProto`` that was
            profiled.

    Returns:
        A tuple ``(instruction_time_by_name, time_by_op_shape, hlo_module)``:
        a dict keyed by instruction name, a dict keyed by
        ``(opcode, shape string)``, and the parsed HLO module.

    Raises:
        FileNotFoundError: if either input file does not exist.
        KeyError: if the profile names an instruction that is absent from
            the HLO module.
    """
    # Fail early with a clear error; previously a missing file surfaced
    # later as an UnboundLocalError on hlo_module / profile_def.
    if not (os.path.exists(hlo_module_path) and os.path.exists(profiler_path)):
        raise FileNotFoundError(
            "HLO module {!r} and profile data {!r} must both exist".format(
                hlo_module_path, profiler_path))

    try:
        cost_model = load("cost_model.pkl")
    except Exception:
        # No usable cache (e.g. first run): start from empty tables.
        cost_model = {}
    instruction_time_by_name = cost_model.setdefault(
        "InstructionName_Time_Dict", {})
    # Kept under its own key; it used to alias the name-keyed table.
    # setdefault also covers caches written before this key existed.
    time_by_op_shape = cost_model.setdefault("Tuple_Time_Dict", {})

    with open(profiler_path, "rb") as f:
        profile_def = profiler_pb2.HloExecutionProfileData()
        profile_def.ParseFromString(f.read())
    with open(hlo_module_path, "rb") as f:
        hlo_proto = hlo_pb2.HloProto()
        hlo_proto.ParseFromString(f.read())
    hlo_module = hlo_proto.hlo_module

    instruction_by_name = {
        instruction.name: instruction
        for computation in hlo_module.computations
        for instruction in computation.instructions
    }

    printer_data = profile_def.printer_data
    profiler_counters = profile_def.profile_counters
    for computation_info in printer_data.computation_infos:
        for instruction_info in computation_info.instruction_infos:
            # First space-separated token of the short name, minus its
            # leading character (presumably a "%" sigil - TODO confirm).
            instruction_name = instruction_info.short_name.split(
                " ")[0].strip()[1:]

            # NOTE(review): 1.6325 looks like a clock rate in GHz turning
            # cycle counts into ns, then /1000 into us - confirm.
            exe_time = (profiler_counters[instruction_info.profile_index] /
                        1.6325) / 1000  # us

            # Keyed by instruction name.
            instruction_time_by_name[instruction_name] = exe_time
            # Keyed by (op type, shape).
            instruction = instruction_by_name[instruction_name]
            shape = get_shape_string(instruction)
            time_by_op_shape[(instruction.opcode, shape)] = exe_time

    save(cost_model, "cost_model.pkl")
    return instruction_time_by_name, time_by_op_shape, hlo_module
Exemple #7
0
def get_train_single_data():
    """Build training samples from the HLO dump and profile in the working directory.

    Reads ``training.hlo.pb`` and ``training_hlo_execution_profile_data``
    when both exist and passes the parsed module and profile to
    ``gen_data_from_hlo_def``. The number of samples is printed either way.

    Returns:
        The list of samples; empty when either file is missing.
    """
    hlo_file = "training.hlo.pb"
    profile_file = "training_hlo_execution_profile_data"
    samples = []

    inputs_present = os.path.exists(hlo_file) and os.path.exists(profile_file)
    if inputs_present:
        with open(profile_file, "rb") as profile_fh:
            profile = profiler_pb2.HloExecutionProfileData()
            profile.ParseFromString(profile_fh.read())
        with open(hlo_file, "rb") as hlo_fh:
            proto = hlo_pb2.HloProto()
            proto.ParseFromString(hlo_fh.read())
        samples.extend(gen_data_from_hlo_def(proto.hlo_module, profile))

    print("training data length:", len(samples))
    return samples
Exemple #8
0
def test_non_empty_proto():
    """The example proto converts to a graph with 155 nodes and 5 functions."""
    example = pbutil.FromFile(TEST_PROTO, hlo_pb2.HloProto())
    program_graph = xla.BuildProgramGraphProto(example)

    nodes = program_graph.node
    assert len(nodes) == 155
    functions = program_graph.function
    assert len(functions) == 5
Exemple #9
0
        # For each tensor fusion threshold, pair the HLO module dump with its
        # timing file and turn the pair into one training sample.
        for tensor_threshold in tensor_thresholds:
            proto_path = path_base.format(worker_num, model_name, op_level,
                                          tensor_threshold, module_name)
            time_path = path_base.format(worker_num, model_name, op_level,
                                         tensor_threshold, time_name)

            # Configurations missing either file are skipped silently.
            if os.path.exists(proto_path) and os.path.exists(time_path):
                with open(proto_path, "rb") as f:
                    hlo_proto = hlo_pb2.HloProto()
                    hlo_proto.ParseFromString(f.read())
                    hlo_module = hlo_proto.hlo_module
                    res = gen_data(hlo_module)
                with open(time_path, "r") as f:
                    # The first line is read and discarded.
                    first_line = f.readline()
                    # Exhaust the file so last_line ends up holding its
                    # final line.
                    # NOTE(review): if nothing follows the first line, the
                    # loop body never runs and last_line is unbound or left
                    # over from a previous iteration.
                    for last_line in f:
                        pass
                    # The time is the text after the last ":" on that line.
                    time = float(last_line.split(":")[-1])
                    res["execution_time"] = time
                training_datas.append(res)
    return training_datas


if __name__ == "__main__":
    # Manual check: parse each listed HLO dump ("<stem>.pb") and print the
    # data generated from its module.
    for stem in ["3_60_after_optimizations.hlo"]:
        with open(stem + ".pb", "rb") as pb_file:
            proto = hlo_pb2.HloProto()
            proto.ParseFromString(pb_file.read())
            generated = gen_data(proto.hlo_module)
            print(generated)
Exemple #10
0
def test_non_empty_proto_to_networkx():
  """Build a networkx graph from an example proto.

  Loads the proto stored at TEST_PROTO, converts it with
  xla2graph.BuildProgramGraphNetworkX and checks the resulting node count.
  """
  proto = pbutil.FromFile(TEST_PROTO, hlo_pb2.HloProto())
  graph = xla2graph.BuildProgramGraphNetworkX(proto)
  assert graph.number_of_nodes() == 155