def main_tree():
    """Run the full pipeline: load data, build the tree, save it, evaluate.

    Steps, in order:

    1. Load the query data via ``gene_queries()``, shuffle it, and split it
       70/30 into a training set and a test set.
    2. Build the instruction set with ``gene_feature`` from the move and
       write instructions (``feature_num`` = 3).
    3. Build the tree from the training set with ``create_tree``.
    4. Pickle the tree to ``myTree2.pickle``.
    5. Convert the tree with ``traverse2model`` and pickle the result to
       ``model.pickle``.
    6. Score the model on the test set and print the result as ``MAP``.

    After each step the elapsed time since the module-level ``start_time``
    is printed.  Returns ``None``.
    """
    # Function-scope import kept from the original so this block does not
    # depend on what the (unseen) top of the file imports.
    import pickle

    def report_elapsed(label):
        # Print a progress label with the seconds elapsed since start_time.
        print(label, time.time() - start_time)

    mv_instructions = [
        "mvpar", "mvLeftSibl", "mvRightSibl", "mvFirstChild", "mvLastChild",
        "mvPrevDFS", "mvNextDFS", "mvPrevLeft", "mvNextLeft",
        "mvPrevNodeValue", "mvPrevNodeType", "mvPrevNodeContext",
    ]
    write_instruction = [["wrVal"], ["wrType"]]

    # 1. Load the data and shuffle it.
    # gene_queries() returns three values here; only the query data is used
    # in this function, so the two vocabularies are discarded.
    _value_vocab, _type_vocab, quer_data = gene_queries()
    report_elapsed("1 data load")
    random.shuffle(quer_data)

    # 70% of the shuffled data trains the tree, the remainder tests it.
    split_pos = int(len(quer_data) * 0.7)
    training_data = quer_data[:split_pos]
    test_data = quer_data[split_pos:]

    # 2. Build the instruction set.
    feature_num = 3
    print("feature_num", feature_num)
    instructions = gene_feature(mv_instructions, write_instruction,
                                feature_num)
    report_elapsed("2 get feature")
    print("data set size", len(training_data), len(test_data))

    # 3. Create the tree.
    my_tree = create_tree(training_data, instructions)
    report_elapsed("3 get myTree")

    # 4. Save the tree.
    with open('myTree2.pickle', 'wb') as tree_file:
        pickle.dump(my_tree, tree_file, protocol=pickle.HIGHEST_PROTOCOL)
    report_elapsed("4 save tree")

    # 5. Derive the model from the tree and save it.
    model_tree = traverse2model(my_tree)
    report_elapsed("5 model")

    with open('model.pickle', 'wb') as model_file:
        pickle.dump(model_tree, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    # 6. Evaluate.
    # NOTE(review): `eval` is presumably a project function that shadows the
    # builtin of the same name -- confirm; renaming it would avoid confusion.
    MAP = eval(model_tree, test_data)
    print("MAP:", MAP)

    # Break the total elapsed seconds down into hours, minutes, seconds.
    now = time.time()
    m, s = divmod((now - start_time), 60)
    h, m = divmod(m, 60)
    # NOTE(review): `length` is a module-level object defined outside this
    # block; its first element is reported as the classifier count.
    print("number of classifier", length[0])
    print("6 evaluate time spend%02d:%02d:%02d: " % (h, m, s))
# Example no. 2
# 0
def main_tree():
    """Run the pipeline with separate evaluation and test sets.

    Steps, in order:

    1. Load the query data via ``gene_queries()``, keep the first 300000
       entries, shuffle them, and take the first 200000 as the working set.
       That working set is split 70/30 into training and evaluation data.
       The test set is the first 100000 entries of
       ``gene_queries(Test_flag=True)``.
    2. Build the instruction set with ``gene_feature`` from the move and
       write instructions (``feature_num`` = 5).
    3. Build the tree from the training data with ``create_tree``.
       (The tree itself is not written to disk in this variant.)
    5. Convert the tree with ``traverse2model`` using a ``HashingVectorizer``
       and pickle the result to ``model.pickle``.
    6. Score the model on the evaluation data and print ``eval MAP``.
    7. Score the model on the test data and print ``test MAP``.

    After the main steps the elapsed time since the module-level
    ``start_time`` is printed.  Returns ``None``.
    """
    # Function-scope import kept from the original so this block does not
    # depend on what the (unseen) top of the file imports.
    import pickle

    def report_elapsed(label):
        # Print a progress label with the seconds elapsed since start_time.
        print(label, time.time() - start_time)

    mv_instructions = [
        "mvpar", "mvLeftSibl", "mvRightSibl", "mvFirstChild", "mvLastChild",
        "mvPrevDFS", "mvNextDFS", "mvPrevLeft", "mvNextLeft",
        "mvPrevNodeValue", "mvPrevNodeType", "mvPrevNodeContext",
    ]
    write_instruction = [["wrVal"], ["wrType"]]

    # 1. Load the data and shuffle it.
    quer_data = gene_queries()[:300000]
    report_elapsed("1 data load")
    random.shuffle(quer_data)
    trn_data = quer_data[:200000]

    # The test set comes from a separate call rather than from a slice of
    # the shuffled query data.
    test_data = gene_queries(Test_flag=True)
    test_data = test_data[:100000]

    # 70% of the working set trains the tree, the remainder evaluates it.
    split_pos = int(len(trn_data) * 0.7)
    training_data = trn_data[:split_pos]
    eval_data = trn_data[split_pos:]
    print("data set size", len(training_data), len(eval_data), len(test_data))

    # 2. Build the instruction set.
    feature_num = 5
    print("feature_num", feature_num)
    instructions = gene_feature(mv_instructions, write_instruction,
                                feature_num)
    report_elapsed("2 get feature")

    # 3. Create the tree.
    my_tree = create_tree(training_data, instructions)
    report_elapsed("3 get myTree")

    # 5. Derive the model from the tree and save it.
    # NOTE(review): assuming this is scikit-learn's HashingVectorizer, the
    # `non_negative` keyword was deprecated in 0.19 and removed in 0.21, so
    # this call raises TypeError on newer releases.  `alternate_sign=False`
    # is the documented replacement but is not numerically identical when
    # hashes collide -- confirm the installed version before changing it.
    vectorizer = HashingVectorizer(
        n_features=20,
        non_negative=True,
    )
    model_tree = traverse2model(my_tree, instructions, vectorizer)
    report_elapsed("5 model")
    with open('model.pickle', 'wb') as model_file:
        pickle.dump(model_tree, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    # 6. Evaluate on the held-out part of the working set.
    # NOTE(review): `eval` is presumably a project function that shadows the
    # builtin of the same name -- confirm; renaming it would avoid confusion.
    MAP = eval(model_tree, eval_data, vectorizer)
    print("eval MAP:", MAP)

    # 7. Evaluate on the separate test set.
    MAP_test = eval(model_tree, test_data, vectorizer)
    print("test MAP:", MAP_test)

    # Break the total elapsed seconds down into hours, minutes, seconds.
    now = time.time()
    m, s = divmod((now - start_time), 60)
    h, m = divmod(m, 60)
    # NOTE(review): `length` is a module-level object defined outside this
    # block; its first element is reported as the classifier count.
    print("number of classifier", length[0])
    print("6 evaluate time spend%02d:%02d:%02d: " % (h, m, s))