Example #1
def Doc2vecFeatureEngineering():
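    # Build doc2vec features for the pickled train/verify splits using the pre-trained doc2vec models loaded below.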
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')
    # train_X=train_X[0:10]   #cut training size for debug
    # train_Y = train_Y[0:10] #cut training size for debug

    # split data into training and verifying sets
    #train_X, verify_X, train_Y, verify_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)

    #load Doc2vec model

    programDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/programDoc2VecModel')
    hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/hintsDoc2VecModel')
    transformDatatoFeatures_doc2vec(train_X, verify_X, programDoc2VecModel,
                                    hintsDoc2VecModel)
    #transformDatatoFeatures_node2vec(train_X, verify_X)
    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
Example #2
def main():
    programList = readMultiplePrograms()
    trainData, testData = shuffleData(programList, 0.8)
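    # 80/20 split; the pickled argument data read below is presumably derived from this split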
    trainData_X, trainData_Y = transformDataToTrainingVector(
        pickleRead("argumentTrainData", path="../"))
    testData_X, testData_Y = transformDataToTrainingVector(
        pickleRead("argumentTestData", path="../"))
    pickleWrite(trainData_X, "argumentTrainData_X", path="../")
    pickleWrite(trainData_Y, "argumentTrainData_Y", path="../")
    pickleWrite(testData_X, "argumentTestData_X", path="../")
    pickleWrite(testData_Y, "argumentTestData_Y", path="../")
Example #3
def Node2vecFeatureEngineering():
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')
    transformDatatoFeatures_node2vec(train_X, verify_X)
    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
Example #4
def predict_unseen_set(params, trained_model_path, file_list=[], set_max_nodes_per_batch=True):
    benchmark_fold = params["benchmark"] + "-" + "predict"
    path = "../benchmarks/" + benchmark_fold + "/"
    benchmark_name = path[len("../benchmarks/"):-1]
    parameters = pickleRead(params["benchmark"] + "-" + params["label"] + "-parameters", "../src/trained_model/")
    parameters["benchmark"] = benchmark_name
    print("vocabulary size:", parameters["node_vocab_size"])
    if set_max_nodes_per_batch:
        parameters['max_nodes_per_batch'] = params["max_nodes_per_batch"]

    if params["force_read"] == True:
        write_graph_to_pickle(benchmark_name, data_fold=["test"], label=params["label"], path=path,
                              file_type=".smt2", graph_type=params["graph_type"],
                              max_nodes_per_batch=params['max_nodes_per_batch'], vocabulary_name=params["benchmark"],
                              file_list=file_list)
    else:
        print("Use pickle data for training")
    # if form_label == True and not os.path.isfile("../pickleData/" + label + "-" + benchmark_name + "-gnnInput_train_data.txt"):
    if params["form_label"] == True:
        form_GNN_inputs_and_labels(label=params["label"], datafold=["test"], benchmark=benchmark_name,
                                   graph_type=params["graph_type"],
                                   gathered_nodes_binary_classification_task=params[
                                       "gathered_nodes_binary_classification_task"])

    dataset = HornGraphDataset(parameters)
    dataset.load_data([DataFold.TEST])
    test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
    loaded_model = tf2_gnn.cli_utils.model_utils.load_model_for_prediction(trained_model_path, dataset)
    return get_predicted_results(params, loaded_model, test_data)
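
A minimal usage sketch (not from the source): predict_unseen_set reads the keys below from its params dict; names, paths, and values here are hypothetical placeholders.

example_params = {
    "benchmark": "my-benchmark",  # hypothetical benchmark name
    "label": "template_relevance",
    "force_read": True,
    "form_label": True,
    "graph_type": "hyperEdgeHornGraph",
    "max_nodes_per_batch": 10000,
    "gathered_nodes_binary_classification_task": ["template_relevance"],
}
# requires the repo's helper modules and a trained model on disk:
# results = predict_unseen_set(example_params, "../src/trained_model/", file_list=[])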
Example #5
def predictAndOutputHints(model, programDoc2VecModel, hintsDoc2VecModel,
                          programGraph2VecModel, hintsGraph2VecModel):
    if not os.path.exists("../predictedHints/"):
        os.makedirs("../predictedHints/")
    test_X = pickleRead('testData_X')
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + "/predictedHints/"
    #embedding
    encodedPrograms_test, encodedHints_test = doc2vecModelInferNewData(
        test_X, programDoc2VecModel, hintsDoc2VecModel)
    graphEncodedPrograms_test, graphEncodedHints_test = graph2vecModelInferNewData(
        test_X, programGraph2VecModel, hintsGraph2VecModel)

    #predict
    sigmoidOutput = model.predict([
        encodedPrograms_test, encodedHints_test, graphEncodedPrograms_test,
        graphEncodedHints_test
    ])
    #transform probability to binary classification
    predicted_y = sigmoidOutput.copy()
    predicted_y[predicted_y > 0.5] = 1  # convert decimals to 0 and 1
    predicted_y[predicted_y <= 0.5] = 0
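    # a vectorized alternative: predicted_y = (sigmoidOutput > 0.5).astype(int)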

    print("Show one example")
    print("test_X[0][0]", test_X[0][0])  # program
    print("test_X[0][1]", test_X[0][1])  # hint text (head \n hint)
    print("test_X[0][2]", test_X[0][2])  # progran graph embedding
    print("test_X[0][3]", test_X[0][3])  # hint graph
    print("test_X[0][4]", test_X[0][4])  # hint ID
    print("test_X len:", len(test_X))
    print(predicted_y[0])

    #write results to file
    # X[5] holds each sample's source file name; deduplicate to get one output file per program
    fileList = list({X[5] for X in test_X})
    for fileName in fileList:
        predictedHintListWithID = list()
        print(fileName)
        with open(path + fileName + ".optimizedHints", "w+") as f:
            #print("sorted")
            for X, y, score in sorted(zip(test_X, predicted_y, sigmoidOutput),
                                      key=lambda t: t[2],
                                      reverse=True):
                #print(X[4],X[1],y,score)
                if X[5] == fileName:
                    #predictedHintListWithID.append([X[4],X[1], X[1],y,score])
                    # ID,head,hint,predicted result,score
                    head = X[1][:X[1].find("\n")]
                    #head=head[:head.find("/")]
                    hint = X[1][X[1].find("\n") + 1:]
                    predictedHintListWithID.append([X[4], head, hint, y, score])
                    content = X[4] + ":" + head + ":" + hint + ":" + "".join(
                        map(str, np.around(y, 0))) + ":" + "".join(
                            map(str, score)) + "\n"
                    f.write(content)
Example #6
def Graph2vecFeatureEngineering():
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')
    programGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/programGraph2VecModel')
    hintsGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/hintsGraph2VecModel')
    transformDatatoFeatures_graph2vec(train_X, verify_X, programGraph2VecModel,
                                      hintsGraph2VecModel)
    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
Example #7
def trainDoc2VecModelfunction(program_dim=100, hint_dim=20):
    X_train = pickleRead('trainData_X')

    # extract programs and hints from dataset
    programs_train, hints_train, graphProgram_train, graphHint_train = data2list(
        X_train)
    programs_train = list(set(programs_train))

    # transform to TaggedDocument
    programs_trainTaggedDocument, programsMaxLength, programsAverageLength = transform2TaggedDocument(
        programs_train)
    hints_trainTaggedDocument, hintsMaxLength, hintsAverageLength = transform2TaggedDocument(
        hints_train)
    # print('programsMaxLength',programsMaxLength)
    # print('programsAverageLength',programsAverageLength)
    # print('hintsMaxLength',hintsMaxLength)
    # print('hintsAverageLength',hintsAverageLength)

    # create Doc2Vec models (window sized from the corpus statistics above)
    programDoc2VecModel = gensim.models.doc2vec.Doc2Vec(
        vector_size=program_dim,
        min_count=0,
        window=programsAverageLength,
        epochs=50)
    hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec(vector_size=hint_dim,
                                                      min_count=0,
                                                      window=hintsMaxLength,
                                                      epochs=50)

    # build vocabulary
    programDoc2VecModel.build_vocab(programs_trainTaggedDocument)
    hintsDoc2VecModel.build_vocab(hints_trainTaggedDocument)
    # train Doc2Vec model
    programDoc2VecModel.train(programs_trainTaggedDocument,
                              total_examples=programDoc2VecModel.corpus_count,
                              epochs=programDoc2VecModel.epochs)
    hintsDoc2VecModel.train(hints_trainTaggedDocument,
                            total_examples=hintsDoc2VecModel.corpus_count,
                            epochs=hintsDoc2VecModel.epochs)
    # save trained doc2vec models
    parenDir = os.path.abspath(os.path.pardir)
    programDoc2VecModel.save(parenDir + '/models/programDoc2VecModel')
    hintsDoc2VecModel.save(parenDir + '/models/hintsDoc2VecModel')
    return programDoc2VecModel, hintsDoc2VecModel
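
A minimal usage sketch (not from the source): train both Doc2Vec models, then embed an unseen token sequence with gensim's standard infer_vector; the token list is a hypothetical placeholder.

# requires the pickled 'trainData_X' produced earlier in the pipeline:
# programModel, hintModel = trainDoc2VecModelfunction(program_dim=100, hint_dim=20)
# vector = programModel.infer_vector(["(assert", "(>=", "x", "0))"])  # 100-dim numpy array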
Example #8
def main():
    print("Start")

    #benchmark='dillig'
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)

    #load Doc2Vec models
    #programDoc2VecModel=gensim.models.doc2vec.Doc2Vec.load(parenDir+'/models/programDoc2VecModel')
    #hintsDoc2VecModel=gensim.models.doc2vec.Doc2Vec.load(parenDir+'/models/hintsDoc2VecModel')

    #load features
    encodedPrograms_train = pickleRead('encodedPrograms_train')
    encodedPrograms_test = pickleRead('encodedPrograms_test')
    encodedHints_train = pickleRead('encodedHints_train')
    encodedHints_test = pickleRead('encodedHints_test')

    graphEncodedPrograms_train = pickleRead('graphEncodedPrograms_train')
    graphEncodedPrograms_test = pickleRead('graphEncodedPrograms_test')

    train_Y = pickleRead('train_Y')
    verify_Y = pickleRead('verify_Y')

    #train
    batch_size = max(1, encodedPrograms_train.shape[0] // 100)  # guard against a zero batch size on small datasets
    epochs = 100
    #history,model=train(encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test,train_Y, verify_Y,batch_size,epochs)
    history, model = train(encodedPrograms_train, encodedPrograms_test,
                           graphEncodedPrograms_train,
                           graphEncodedPrograms_test, encodedHints_train,
                           encodedHints_test, train_Y, verify_Y, batch_size,
                           epochs)
    plotHistory(history)
Example #9
def main():
    print("Start")
    #remove files in testData, pickleData, and models
    if os.path.exists("../testData"):
        shutil.rmtree("../testData/")
        shutil.rmtree("../pickleData/")
        shutil.rmtree("../models/")
        os.mkdir("../testData")
        os.mkdir("../pickleData")
        os.mkdir("../models")
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)

    # get graph data
    #callEldaricaGenerateGraphs('trainData')

    # transformOneFiletoFeatures(path)
    train_X, train_Y, verify_X, verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(
        path, dataset='train', discardNegativeData=False, smallTrain=False,
        smallTrainSize=50, trainDataSplitRate=0.8)
    # train_X=pickleRead('trainData_X')
    # train_Y = pickleRead('trainData_Y')
    # verify_X = pickleRead('verifyData_X')
    # verify_Y = pickleRead('verifyData_Y')
    #train_X=train_X[0:40]   #cut training size for debug
    #train_Y = train_Y[0:40] #cut training size for debug

    #train and save Doc2Vec models
    print("train Doc2Vec model (text) begin")
    trainDoc2VecModelfunction(program_dim=100, hint_dim=20)
    print("train Doc2Vec model (text) end")
    print("train Doc2Vec model (graph) begin")
    trainGraph2VecModelfunction(program_dim=100, hint_dim=20)
    print("train Doc2Vec model (graph) end")
    # load Doc2Vec models
    #programDoc2VecModel =gensim.models.doc2vec.Doc2Vec.load(parenDir +'/models/programDoc2VecModel')
    #hintsDoc2VecModel =gensim.models.doc2vec.Doc2Vec.load(parenDir +'/models/hintsDoc2VecModel')
    #programGraph2VecModel =gensim.models.doc2vec.Doc2Vec.load(parenDir +'/models/programGraph2VecModel')
    #hintsGraph2VecModel =gensim.models.doc2vec.Doc2Vec.load(parenDir +'/models/hintsGraph2VecModel')

    # split data into training and verifying sets
    # train_X, verify_X, train_Y, verify_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)

    # checkSplitData(X_train, X_test, y_train, y_test)

    # feature engineering
    # encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test=transformDatatoFeatures_tokennizer(train_X,verify_X)
    # encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test,\
    #     =transformDatatoFeatures_doc2vec(train_X, verify_X,programDoc2VecModel,hintsDoc2VecModel)
    from Data2Features import Doc2vecFeatureEngineering,Node2vecFeatureEngineering,Graph2vecFeatureEngineering
    Doc2vecFeatureEngineering()
    Graph2vecFeatureEngineering()
    #Node2vecFeatureEngineering()

    # load features
    encodedPrograms_train = pickleRead('encodedPrograms_train')
    encodedPrograms_test = pickleRead('encodedPrograms_verify')

    graphEncodedPrograms_train = pickleRead('graphEncodedPrograms_train')
    graphEncodedPrograms_test = pickleRead('graphEncodedPrograms_verify')

    encodedHints_train = pickleRead('encodedHints_train')
    encodedHints_test = pickleRead('encodedHints_verify')

    graphencodedHints_train = pickleRead('graphEncodedHints_train')
    graphencodedHints_test = pickleRead('graphEncodedHints_verify')

    train_Y = pickleRead('train_Y')
    verify_Y = pickleRead('verify_Y')

    # train
    batch_size = max(1, round(encodedPrograms_train.shape[0] / 100))
    epochs = 100
    # #without graph
    # history, model = train2(encodedPrograms_train, encodedPrograms_test,\
    #                         encodedHints_train, encodedHints_test, train_Y,\
    #                         verify_Y, batch_size, epochs)
    # #with program graph
    # history, model = train3(encodedPrograms_train, encodedPrograms_test,\
    #                        graphEncodedPrograms_train,graphEncodedPrograms_test,\
    #                        encodedHints_train, encodedHints_test,\
    #                        train_Y,verify_Y, batch_size, epochs)
    #with program graph and hint graph
    history, model = train4(encodedPrograms_train, encodedPrograms_test,
                            graphEncodedPrograms_train, graphEncodedPrograms_test,
                            encodedHints_train, encodedHints_test,
                            graphencodedHints_train, graphencodedHints_test,
                            train_Y, verify_Y, batch_size, epochs)
Example #10
def wrapped_prediction(
        trained_model_path,
        benchmark,
        benchmark_fold,
        label="template_relevance",
        force_read=True,
        form_label=True,
        json_type=".hyperEdgeHornGraph.JSON",
        graph_type="hyperEdgeHornGraph",
        gathered_nodes_binary_classification_task=["template_relevance"],
        hyper_parameter={},
        set_max_nodes_per_batch=False,
        file_list=[]):

    path = "../benchmarks/" + benchmark_fold + "/"
    benchmark_name = path[len("../benchmarks/"):-1]
    parameters = pickleRead(benchmark + "-" + label + "-parameters",
                            "../src/trained_model/")
    parameters["benchmark"] = benchmark_name
    print("vocabulary size:", parameters["node_vocab_size"])
    if set_max_nodes_per_batch:
        parameters['max_nodes_per_batch'] = hyper_parameter["max_nodes_per_batch"]

    if force_read:
        write_graph_to_pickle(
            benchmark_name,
            data_fold=["test"],
            label=label,
            path=path,
            file_type=".smt2",
            graph_type=graph_type,
            max_nodes_per_batch=parameters['max_nodes_per_batch'],
            vocabulary_name=benchmark,
            file_list=file_list)
    else:
        print("Use pickle data for training")
    # if form_label == True and not os.path.isfile("../pickleData/" + label + "-" + benchmark_name + "-gnnInput_train_data.txt"):
    if form_label:
        form_GNN_inputs_and_labels(label=label,
                                   datafold=["test"],
                                   benchmark=benchmark_name,
                                   graph_type=graph_type,
                                   gathered_nodes_binary_classification_task=
                                   gathered_nodes_binary_classification_task)

    quiet = False
    dataset = HornGraphDataset(parameters)
    dataset.load_data([DataFold.TEST])
    test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
    loaded_model = tf2_gnn.cli_utils.model_utils.load_model_for_prediction(
        trained_model_path, dataset)
    _, _, test_results = loaded_model.run_one_epoch(test_data,
                                                    training=False,
                                                    quiet=quiet)
    test_metric, test_metric_string = loaded_model.compute_epoch_metrics(
        test_results)
    predicted_Y_loaded_model = loaded_model.predict(test_data)

    print("test_metric_string", test_metric_string)
    print("test_metric", test_metric)

    # test measurement
    true_Y = []
    for data in test_data:
        true_Y.extend(np.array(data[1]["node_labels"]))
    true_Y_by_file = [np.array(data) for data in dataset._label_list["test"]]
    true_Y_file_list = list(dataset._file_list["test"])

    mse_loaded_model = tf.keras.losses.MSE(true_Y, predicted_Y_loaded_model)
    print("\n mse_loaded_model_predicted_Y_and_True_Y", mse_loaded_model)

    mse_mean = tf.keras.losses.MSE([np.mean(true_Y)] * len(true_Y), true_Y)
    print("\n mse_mean_Y_and_True_Y", mse_mean)
    # reuse a stored threshold if requested, otherwise compute and pickle a new one
    best_set_threshold = (hyper_parameter["best_threshold_set"]
                          if hyper_parameter["read_best_threshold"] else
                          write_best_threshod_to_pickle(
                              parameters, true_Y, predicted_Y_loaded_model,
                              label, benchmark))
    best_set_ranks = ({"top_percentage": 0, "accuracy": 0}
                      if hyper_parameter["read_best_threshold"] else
                      wrapped_set_threshold_by_ranks(
                          true_Y, true_Y_by_file, predicted_Y_loaded_model,
                          true_Y_file_list))

    print("----------", benchmark_fold, "-----", label, "----------")
    print(hyper_parameter)
    positive_label_number = sum(true_Y)
    negative_label_number = len(true_Y) - positive_label_number

    print("best_set_threshold", best_set_threshold)
    print("positive_label_percentage", positive_label_number / len(true_Y))
    print("negative_label_number", negative_label_number / len(true_Y))
    print("best_set_threshold", "threshold value:",
          best_set_threshold["threshold"], "accuracy:",
          best_set_threshold["accuracy"])
    print("best_set_ranks", "top_percentage:",
          best_set_ranks["top_percentage"], "accuracy:",
          best_set_ranks["accuracy"])

    random_guess_accuracy = max(positive_label_number / len(true_Y),
                                negative_label_number / len(true_Y))
    print(
        "{0:.2%}".format(
            max(best_set_threshold["accuracy"], best_set_ranks["accuracy"]) -
            random_guess_accuracy), "better than random guess")
    return {
        "trained_model_path": trained_model_path,
        "best_set_threshold": best_set_threshold["accuracy"],
        "best_set_ranks": best_set_ranks["accuracy"],
        "benchmark_fold": benchmark_fold,
        "label": label,
        "hyper_parameter": hyper_parameter,
        "positive_label_percentage": positive_label_number / len(true_Y),
        "negative_label_number": negative_label_number / len(true_Y),
        "dataset": dataset,
        "predicted_Y_loaded_model": predicted_Y_loaded_model,
        "best_threshold": best_set_threshold["threshold"]
    }
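
A minimal usage sketch (not from the source): wrapped_prediction expects hyper_parameter to carry "read_best_threshold" (plus "best_threshold_set" when it is True, and "max_nodes_per_batch" when set_max_nodes_per_batch=True); paths and names here are hypothetical placeholders.

# results = wrapped_prediction(
#     trained_model_path="../src/trained_model/",
#     benchmark="my-benchmark",
#     benchmark_fold="my-benchmark-predict",
#     hyper_parameter={"read_best_threshold": False, "max_nodes_per_batch": 10000})
# print("best threshold:", results["best_threshold"])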