Example #1
import os
import time
import json
import ConfigParser
import cPickle

import numpy
from random import shuffle  # the original may use numpy.random.shuffle instead; either works here

# download_file, SupervisedDataSet, create_neuralnet and create_optimizer are
# assumed to be provided by the project's own modules (their imports are not shown).


def test_dnn_for_big_data(config_file):

    config = ConfigParser.ConfigParser()
    config.read(config_file)

    hadoop_bin = config.get("hadoop", 'bin')

    temp_dir = config.get('temp','temp_dir')

    sample_file_list = config.get("input", 'sample_file_list')
    frame_name = config.get("input", 'data_frame_name')
    chunk_size = int(config.get("input", 'chunk_size'))

    model_file_path = config.get("model", 'model_file_path')

    predict_file_path = config.get("output", 'predict_file_path')

    with open(model_file_path, 'rb') as model_file:
        # the model file stores the network architecture plus its trained parameters
        model_data = cPickle.load(model_file)

    parameter = model_data["parameter"]
    del model_data["parameter"]

    neuralnet = create_neuralnet(model_data)
    neuralnet.set_parameter(parameter)


    sample_file_paths = []
    with open(sample_file_list,'r') as f:
        for line in f:
            line = line.strip()
            if line:
                sample_file_paths.append(line)

    predict_file = open(predict_file_path, 'w')

    for file_path in sample_file_paths:

        if file_path.startswith("hdfs:"):
            local_file_path = download_file(hadoop_bin, file_path, temp_dir)
        else:
            local_file_path = file_path
        train_data_set = SupervisedDataSet(local_file_path, frame_name=frame_name)

        print time.ctime() + ":\tbegin predict with sample : " + remote_file_path

        for idx, (train_X, train_y_) in enumerate(train_data_set.sample_batches(batch_size=chunk_size)):

            predict_y = neuralnet.predict(train_X)

            output_val = numpy.concatenate((train_y_, predict_y), axis=1)
            # numpy values must be converted to strings before joining
            predict_file.write("\n".join("\t".join(str(v) for v in row) for row in output_val))
            predict_file.write("\n")

        # clean up the temporary local copy of the HDFS file
        if file_path.startswith("hdfs:"):
            os.remove(local_file_path)
    predict_file.close()
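For reference, test_dnn_for_big_data() only reads the sections and options shown below, so a minimal prediction config could look like the following sketch (every path and value here is a placeholder, not taken from the original project):

[hadoop]
bin = /usr/local/hadoop/bin/hadoop

[temp]
temp_dir = /tmp/dnn_predict

[input]
; one sample file path per line; paths starting with "hdfs:" are downloaded first
sample_file_list = /data/predict_file_list.txt
data_frame_name = samples
chunk_size = 10000

[model]
model_file_path = /models/dnn_model_9.dat

[output]
predict_file_path = /data/predictions.tsv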
def train_dnn_for_big_data(config_file):

    config = ConfigParser.ConfigParser()
    config.read(config_file)

    hadoop_bin = config.get("hadoop", 'bin')

    temp_dir = config.get('temp','temp_dir')

    sample_file_list = config.get("input", 'sample_file_list')
    frame_name = config.get("input", 'data_frame_name')

    output_model_prefix = config.get("output", 'output_model_prefix')

    try:
        network_arch = json.loads(config.get("network","architecture"))
    except:
        print config.get("network","architecture")
        raise

    max_epoches = int(config.get("train", 'max_epoches'))
    chunk_size = int(config.get("train", 'chunk_size'))
    optim_settings = json.loads(config.get("train", 'optim_settings'))

    neuralnet = create_neuralnet(network_arch)
    optimizer = create_optimizer(optim_settings)

    optimizer.work_for(neuralnet)

    sample_file_paths = []
    with open(sample_file_list,'r') as f:
        for line in f:
            line = line.strip()
            if line:
                sample_file_paths.append(line)


    for i in range(max_epoches):
        print time.ctime() + ":\tbegin epoche :", i
        shuffle(sample_file_paths)
        for file_path in sample_file_paths:

            if file_path.startswith("hdfs:"):
                local_file_path = download_file(hadoop_bin, file_path, temp_dir)
            else:
                local_file_path = file_path

            train_data_set = SupervisedDataSet(local_file_path, frame_name=frame_name)

            print time.ctime() + ":\tbegin training with sample : " + file_path

            try:

                for idx, (train_X, train_y) in enumerate(train_data_set.sample_batches(batch_size=chunk_size)):

                    print time.ctime() + ":\tbegin new chunk : ", idx, "@epoch : ", i

                    optimizer.update_chunk(train_X, train_y)

                    new_param = optimizer.optimize(neuralnet.get_parameter())

                    neuralnet.set_parameter(new_param)
            except Exception as e:
                # keep training on the remaining sample files if one chunk fails
                print "training failed on " + file_path + ": " + str(e)


            # clean up the temporary local copy of the HDFS file
            if file_path.startswith("hdfs:"):
                os.remove(local_file_path)


        # snapshot the model after every epoch: architecture description + current parameters
        with open(output_model_prefix + "_" + str(i) + ".dat", 'wb') as f:
            content = dict(network_arch)  # copy, so the architecture dict itself is not mutated
            content["parameter"] = neuralnet.get_parameter()
            cPickle.dump(content, f, protocol=cPickle.HIGHEST_PROTOCOL)
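train_dnn_for_big_data() reads a similar layout: the same [hadoop], [temp] and [input] sections (chunk_size moves into [train]), a [network] section and a [train] section whose architecture and optim_settings options are parsed as JSON, and an output_model_prefix instead of a predict_file_path. The sketch below again uses placeholder values; the exact keys accepted inside the two JSON strings depend on create_neuralnet() and create_optimizer(), which are not shown here:

[hadoop]
bin = /usr/local/hadoop/bin/hadoop

[temp]
temp_dir = /tmp/dnn_train

[input]
sample_file_list = /data/train_file_list.txt
data_frame_name = samples

[network]
; JSON string passed to create_neuralnet()
architecture = {"layers": [512, 256, 2]}

[train]
max_epoches = 10
chunk_size = 10000
; JSON string passed to create_optimizer()
optim_settings = {"type": "sgd", "learning_rate": 0.01}

[output]
output_model_prefix = /models/dnn_model

With a file like this saved as train.cfg, training is started with train_dnn_for_big_data("train.cfg"); the function then writes one pickled snapshot per epoch, e.g. /models/dnn_model_0.dat, /models/dnn_model_1.dat, and so on.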