Ejemplo n.º 1
0
def show_authors(model, basefolder=None):
    """Cluster authors by their averaged tree embeddings and list neighbors.

    Parses the C++ dataset, embeds every tree with ``model``, averages the
    embeddings per author, then runs KMeans clustering and a 5-nearest-
    neighbor lookup over those author vectors.  Plots go under
    ``basefolder`` when it is given.
    """
    dataset_folder = "dataset/cpp"
    trees, tree_labels, lable_problems, features = parse_src_files(
        dataset_folder)
    # trees, tree_labels = pick_subsets(trees, tree_labels, labels=2)
    classes_, y = np.unique(tree_labels, return_inverse=True)
    trees_X, _ = evaluate(model, trees, y)

    # estimator = DBSCAN(eps=0.3, min_samples=10)
    print("\n")
    print("*" * 10, " Cluster Authors:", "*" * 10)
    estimator = KMeans(n_clusters=5, init='k-means++')

    # Collapse all of an author's tree embeddings into one mean vector.
    author_vectors = []
    author_ids = []
    author_names = []
    for author in np.unique(tree_labels):
        author_vectors.append(trees_X[tree_labels == author].mean(axis=0))
        author_ids.append(author)
        author_names.append(classes_[np.argwhere(classes_ == author)[0][0]])

    X_avg = np.vstack(author_vectors)
    y_avg = np.array(author_ids)
    label_avg = np.array(author_names)

    X_scale = scale(X_avg)
    cluster_table(estimator, trees_X, y, tree_labels)
    cluster_plot(estimator, X_scale, y_avg, label_avg, basefolder=basefolder)

    estimator = NearestNeighbors(n_neighbors=5)
    print("*" * 10, " Neighbors of Authors:", "*" * 10)
    neighbors_table(estimator, trees_X, y, tree_labels)
Ejemplo n.º 2
0
def main(pipline):
    """Run 10-fold stratified cross-validation of *pipline* on the C++ dataset.

    Prints per-fold accuracy, feature statistics read off the pipeline's
    extract/select/classifier steps, and the average accuracy over folds.

    Parameters
    ----------
    pipline : sklearn.pipeline.Pipeline
        Three-step pipeline: (feature extractor, feature selector,
        random-forest classifier) — indexed positionally below.
    """
    # Forward slashes resolve on every platform; the original
    # "dataset\cpp" literal only worked on Windows.
    basefolder = "dataset/cpp"
    X, y, tags, features = parse_src_files(basefolder)

    print("%s problems, %s users :" % (len(set(tags)), len(set(y))))

    folds = StratifiedKFold(n_splits=10)
    accuracy = []
    import_features = defaultdict(int)  # feature index -> #folds where importance > 0
    features = []  # selected feature categories, accumulated over folds
    for idx, (train, test) in enumerate(folds.split(X, y)):
        pipline.fit(X[train], y[train])
        y_predict = pipline.predict(X[test])
        accuracy.append(accuracy_score(y[test], y_predict))

        extract = pipline.steps[0][1]
        select = pipline.steps[1][1]
        rf = pipline.steps[2][1]

        print("accuracy = ", accuracy[-1])
        # Bug fix: the original called np.nonzero(rf) on the estimator object
        # itself; the quantity of interest is the non-zero feature importances.
        non_zero_features = np.nonzero(rf.feature_importances_)[0]
        print("zero features =", len(rf.feature_importances_) - len(non_zero_features))
        print("Non zero features =", len(non_zero_features))
        for feature in non_zero_features:
            import_features[feature] += 1
        for f in select.important_indices:
            features.append(extract.features_categories[f])

    print("features categories =", [(k, v / float(len(features)) * 100.0) for k, v in Counter(features).most_common()])
    print("AVG =", np.mean(accuracy))
Ejemplo n.º 3
0
def test_all():
    """Smoke-test fitting an ASTVectorizer over the whole dataset.

    Failures are printed with their traceback instead of propagating, so a
    broken vectorizer configuration does not abort a larger test run.
    """
    basefolder = get_basefolder()
    X, y, tags = parse_src_files(basefolder)
    try:
        ast_tree = ASTVectorizer(ngram=2, normalize=True, idf=True, norm="l2")
        ast_tree.fit(X, y)
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt; best-effort behavior is preserved.
        print(traceback.format_exc())
Ejemplo n.º 4
0
def main_gridsearch():
    """Grid-search the AST-vectorizer -> feature-selection -> random-forest
    pipeline over the dataset and report the best parameter set.

    Side effects: prints progress, timing, the best score/parameters, and a
    report of the grid results to stdout.
    """
    basefolder = get_basefolder()
    X, y, tags = parse_src_files(basefolder)

    print("%s problems, %s users :" % (len(set(tags)), len(set(y))))
    pipline = Pipeline([
        ('ast', ASTVectorizer(DotNodes(),normalize=True, idf=True, dtype=np.float32)),
        ('select', TopRandomTreesEmbedding()),  # PredefinedFeatureSelection()),
        ('clf', RandomForestClassifier())])

    folds = StratifiedKFold(n_splits=5)
    # Parameter grid keys follow sklearn's "<step>__<param>" convention.
    parameters = {
        'ast__ngram': (2,),
        # 'ast__v_skip': (1, 2),

        'select__k': (1000, 1200, 1500, 2000),
        'select__n_estimators': (1000, 1500),
        'select__max_depth': (40, 60),

        'clf__n_estimators': (1000, 1500),
        # NOTE(review): min_samples_split=1 is rejected by scikit-learn
        # >= 0.18 (must be >= 2); this value only works on older releases.
        'clf__min_samples_split': (1,),
    }

    grid_search = GridSearchCV(estimator=pipline, param_grid=parameters, cv=folds, n_jobs=5, verbose=10)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # NOTE(review): grid_scores_ was removed in scikit-learn 0.20 in favor of
    # cv_results_ — confirm the project's pinned sklearn version before
    # migrating, since the project-local report() expects this shape.
    report(grid_search.grid_scores_)
Ejemplo n.º 5
0
def main_experiment():
    """Train a recursive (Bi/Tree)LSTM authorship model from CLI arguments.

    Parses the dataset, optionally restricts it to a configured subset of
    author classes, builds the requested model, then trains for up to 500
    epochs with a stepped learning-rate schedule, saving the best model and
    appending per-epoch metrics to ``results/<folder>/<name>_results.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        '-t',
                        type=str,
                        default="",
                        help='Experiment Training data info')

    parser.add_argument('--name',
                        '-n',
                        type=str,
                        default="default_experiment",
                        help='Experiment name')
    parser.add_argument('--dataset',
                        '-d',
                        type=str,
                        default="cpp",
                        help='Experiment dataset')
    parser.add_argument('--classes',
                        '-c',
                        type=int,
                        default=-1,
                        help='How many classes to include in this experiment')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--folder',
                        '-f',
                        type=str,
                        default="",
                        help='Base folder for logs and results')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=1,
                        help='Number of examples in each mini batch')
    parser.add_argument('--layers',
                        '-l',
                        type=int,
                        default=1,
                        help='Number of Layers for LSTMs')
    parser.add_argument('--dropout',
                        '-dr',
                        type=float,
                        default=0.2,
                        help='Dropout ratio')
    parser.add_argument('--iterations',
                        '-i',
                        type=int,
                        default=0,
                        help='CV iterations')
    parser.add_argument('--seperate',
                        '-st',
                        action='store_true',
                        default=False,
                        help='Parse trees separately')
    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default="bilstm",
                        help='Model used for this experiment')
    parser.add_argument('--units',
                        '-u',
                        type=int,
                        default=100,
                        help='Number of hidden units')
    parser.add_argument('--cell',
                        '-cl',
                        type=str,
                        default="lstm",
                        help='peeplstm')
    parser.add_argument('--residual',
                        '-r',
                        action='store_true',
                        default=False,
                        help='Use residual connections')
    parser.add_argument('--save',
                        '-s',
                        type=int,
                        default=1,
                        help='Save best models')

    args = parser.parse_args()

    n_epoch = 500
    n_units = args.units
    batch_size = args.batchsize
    gpu = args.gpu
    models_base_folder = "saved_models"
    output_folder = os.path.join("results", args.folder)
    exper_name = args.name
    dataset_folder = os.path.join("dataset", args.dataset)
    seperate_trees = args.seperate
    model_name = args.model
    layers = args.layers
    dropout = args.dropout
    cell = args.cell
    residual = args.residual

    trees, tree_labels, lable_problems, tree_nodes = parse_src_files(
        dataset_folder, seperate_trees=seperate_trees)
    if args.train:
        # A training-config file fixes both the random seed and the classes.
        rand_seed, classes = read_train_config(
            os.path.join("train", args.dataset, args.train))
        trees, tree_labels = pick_subsets(trees, tree_labels, classes=classes)
    else:
        rand_seed = random.randint(0, 4294967295)
        if args.classes > -1:
            trees, tree_labels = pick_subsets(trees,
                                              tree_labels,
                                              labels=args.classes,
                                              seed=rand_seed,
                                              classes=None)

    if model_name in ("treelstm", "slstm"):
        # Tree-structured models need binarized trees of bounded arity.
        trees = make_binary_tree(unified_ast_trees(trees), layers)

    # trees, tree_labels, lable_problems = generate_trees(labels=2,children=5,examples_per_label=10)
    # tree_nodes = AstNodes()

    train_trees, train_lables, test_trees, test_lables, classes, cv = split_trees(
        trees,
        tree_labels,
        n_folds=5,
        shuffle=True,
        seed=rand_seed,
        iterations=args.iterations)

    output_file = open(os.path.join(output_folder,
                                    exper_name + "_results.txt"),
                       mode="+w")
    output_file.write("Testing the model on all the datasets\n")
    output_file.write("Args :- " + str(args) + "\n")
    output_file.write("Seed :- " + str(rand_seed) + "\n")

    output_file.write("Classes :- (%s)\n" % [(idx, c)
                                             for idx, c in enumerate(classes)])
    output_file.write("Class ratio :- %s\n" % list(
        sorted([(t, c, c / len(tree_labels))
                for t, c in collections.Counter(tree_labels).items()],
               key=itemgetter(0),
               reverse=False)))
    output_file.write("Cross Validation :-%s\n" % cv)
    output_file.write(
        "Train labels :- (%s,%s%%): %s\n" %
        (len(train_lables),
         (len(train_lables) / len(tree_labels)) * 100, train_lables))
    output_file.write(
        "Test  labels :- (%s,%s%%): %s\n" %
        (len(test_lables),
         (len(test_lables) / len(tree_labels)) * 100, test_lables))

    # Build the requested model; all variants share the same classifier head.
    if model_name == "lstm":
        model = RecursiveLSTM(n_units,
                              len(classes),
                              layers=layers,
                              dropout=dropout,
                              feature_dict=tree_nodes,
                              classes=classes,
                              cell=cell,
                              residual=residual)
    elif model_name == "bilstm":
        model = RecursiveBiLSTM(n_units,
                                len(classes),
                                layers=layers,
                                dropout=dropout,
                                feature_dict=tree_nodes,
                                classes=classes,
                                cell=cell,
                                residual=residual)
    elif model_name == "treelstm":
        model = RecursiveTreeLSTM(n_children=layers,
                                  n_units=n_units,
                                  n_label=len(classes),
                                  dropout=dropout,
                                  feature_dict=tree_nodes,
                                  classes=classes)
    else:
        print("No model was found")
        return

    output_file.write("Model:  {0}\n".format(exper_name))
    output_file.write("Params: {:,} \n".format(model.params_count()))
    output_file.write("        {0} \n".format(type(model).__name__))
    print_model(model, depth=1, output=output_file)

    if gpu >= 0:
        model.to_gpu()

    # Setup optimizer (plain SGD; lr is overridden each epoch by range_decay).
    optimizer = optimizers.SGD(lr=0.01)
    output_file.write("Optimizer: {0} ".format(
        (type(optimizer).__name__, optimizer.__dict__)))
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.001))
    optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))

    hooks = [(k, v.__dict__) for k, v in optimizer._hooks.items()]
    output_file.write(" {0} \n".format(hooks))

    output_file.write("Evaluation\n")
    output_file.write("{0:<10}{1:<20}{2:<20}{3:<20}{4:<20}{5:<20}\n".format(
        "epoch", "learing_rate", "train_loss", "test_loss", "train_accuracy",
        "test_accuracy"))
    output_file.flush()

    # Unused alternative schedules, kept for experimentation.
    def drop_decay(epoch, initial_lrate=0.1, drop=0.5, epochs_drop=10.0):
        """Halve the rate every `epochs_drop` epochs (step decay)."""
        lrate = initial_lrate * math.pow(drop,
                                         math.floor((1 + epoch) / epochs_drop))
        return lrate

    def time_decay(epoch, initial_lrate=0.1, decay=0.1):
        """1/t decay of the learning rate."""
        return initial_lrate * 1 / (1 + decay * epoch)

    def range_decay(epoch):
        """Piecewise-constant learning rate looked up by epoch range."""
        class RangeDictionary(dict):
            def __getitem__(self, key):
                for r in self.keys():
                    if key in r:
                        return super().__getitem__(r)
                return super().__getitem__(key)

        # Bug fix: the table previously jumped from range(100, 150) to
        # range(200, 300), so epochs 150-199 fell through RangeDictionary
        # and raised KeyError at epoch 151.  The ranges now tile [0, 500).
        rates = {
            range(0, 50): 0.01,
            range(50, 100): 0.005,
            range(100, 150): 0.001,
            range(150, 300): 0.0005,
            range(300, 500): 0.0001
        }
        return RangeDictionary(rates)[epoch]

    best_scores = (-1, -1, -1)  # (epoch, loss, accuracy)
    for epoch in range(1, n_epoch + 1):
        optimizer.lr = range_decay(epoch - 1)
        print('Epoch: {0:d} / {1:d}'.format(epoch, n_epoch))
        print("optimizer lr = ", optimizer.lr)
        print('Train')
        training_accuracy, training_loss = train(model,
                                                 train_trees,
                                                 train_lables,
                                                 optimizer,
                                                 batch_size,
                                                 shuffle=True)
        print('Test')
        test_accuracy, test_loss = evaluate(model, test_trees, test_lables,
                                            batch_size)
        print()

        # Save the model with the best accuracy so far (ties broken by loss).
        saved = False
        if args.save > 0 and epoch > 0:
            epoch_, loss_, acc_ = best_scores
            if test_accuracy > acc_ or (test_accuracy >= acc_
                                        and test_loss <= loss_):
                # remove_old_model(models_base_folder,exper_name, epoch_)
                save_new_model(model, optimizer, models_base_folder,
                               exper_name, epoch)
                saved = True
                print("saving ... ")
                best_scores = (epoch, test_loss, test_accuracy)

        output_file.write(
            "{0:<10}{1:<20.10f}{2:<20.10f}{3:<20.10f}{4:<20.10f}{5:<20.10f}{6:<10}\n"
            .format(epoch, optimizer.lr, training_loss, test_loss,
                    training_accuracy, test_accuracy,
                    "saved" if saved else ""))
        output_file.flush()

        # Early stopping once the test metrics saturate (after a warm-up).
        if epoch >= 5 and (test_loss < 0.001 or test_accuracy >= 1.0):
            output_file.write("\tEarly Stopping\n")
            print("\tEarly Stopping")
            break

    output_file.close()
Ejemplo n.º 6
0
    X = X[bool_vec]
    y = y[bool_vec]
    print("After:")
    print("Class ratio :- %s\n" % list(
        sorted([(t, c, "%.2f" % (c / len(y))) for t, c in collections.Counter(y).items()], key=itemgetter(0),
               reverse=False)))
    depths = np.array([max_depth(x) for x in X])
    branches = np.array([max_branch(x) for x in X])
    plot_dists("Single Tree", depths, branches, max_len=10)



if __name__ == "__main__":
    # Compute and print the average depth and branching factor of the
    # parsed AST trees for the chosen dataset.
    dataset = "cpp"
    # train = "70_authors.labels1.txt"
    X, y, tags, features = parse_src_files(os.path.join("dataset", dataset),
                                           seperate_trees=False, verbose=0)
    # rand_seed, classes = read_train_config(os.path.join("train", dataset, train))
    # X, y = pick_subsets(X, y, classes=classes)
    # for file in os.listdir(os.path.join("train","cpp")):
    #     print(file)
    #     rand_seed, classes = read_config(os.path.join("train","cpp",file))
    #     X, y = pick_subsets(X_e, y_e, classes=classes)
    #     print("Class ratio :- %s" % list(sorted([(t, c, c / len(y)) for t, c in collections.Counter(y).items()], key=itemgetter(0),reverse=False)))
    #     print()
    # X = make_binary_tree(unified_ast_trees(X), 9)
    tree_depths = np.array([max_depth(tree) for tree in X])
    tree_branches = np.array([max_branch(tree) for tree in X])

    print(np.mean(tree_depths))
    print(np.mean(tree_branches))
    # plot_dists("Python Average", depths, branches, max_len=200,base_folder=R"C:\Users\bms\Files\current\research\stylemotry\stylometry papers\usenix\img")#,base_folder=R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\dataset\analysis")
Ejemplo n.º 7
0
def main_experiment():
    """Resume a training experiment from a configuration file.

    Reads the experiment config (``--config``), rebuilds the model, restores
    the newest model/optimizer snapshots from ``saved_models``, and continues
    training from ``last_epoch``, appending per-epoch metrics to the
    experiment's results file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        '-c',
                        type=str,
                        default="",
                        help='Configuration file')

    args = parser.parse_args()
    if args.config == "":
        parser.print_help()
        return
    args, seed, classes, last_epoch = read_config(args.config)

    n_epoch = 500
    n_units = args.units
    batch_size = args.batchsize
    gpu = args.gpu
    models_base_folder = "saved_models"
    output_folder = os.path.join("results", args.folder)
    exper_name = args.name
    dataset_folder = os.path.join("dataset", args.dataset)
    seperate_trees = args.seperate
    model_name = args.model
    layers = args.layers
    dropout = args.dropout
    cell = args.cell
    residual = args.residual

    trees, tree_labels, lable_problems, tree_nodes = parse_src_files(
        dataset_folder, seperate_trees=seperate_trees)
    if model_name == "lstm":
        model = RecursiveLSTM(n_units,
                              len(classes),
                              layers=layers,
                              dropout=dropout,
                              feature_dict=tree_nodes,
                              classes=classes,
                              cell=cell,
                              residual=residual)
    elif model_name == "bilstm":
        model = RecursiveBiLSTM(n_units,
                                len(classes),
                                layers=layers,
                                dropout=dropout,
                                feature_dict=tree_nodes,
                                classes=classes,
                                cell=cell,
                                residual=residual)
    elif model_name == "treelstm":
        model = RecursiveTreeLSTM(n_children=layers,
                                  n_units=n_units,
                                  n_label=len(classes),
                                  dropout=dropout,
                                  feature_dict=tree_nodes,
                                  classes=classes)
    else:
        print("No model was found")
        return

    # Load the newest saved model snapshot ("<name>_epoch_<N>.my").
    model_saved_name = "{0}_epoch_".format(exper_name)
    saved_models = [
        m for m in os.listdir(models_base_folder)
        if m.startswith(model_saved_name) and m.endswith(".my")
    ]
    if len(saved_models) > 0:
        # Pick the snapshot with the highest epoch number.
        model_saved_name = list(
            sorted(saved_models,
                   key=lambda name: int(name.split(".")[0].split("_")[-1]),
                   reverse=True))[0]
    else:
        print("No model was found to load")
        return
    path = os.path.join(models_base_folder, model_saved_name)
    serializers.load_npz(path, model)

    if gpu >= 0:
        model.to_gpu()

    # Setup optimizer; its state is overwritten by the snapshot loaded below.
    optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.001))
    optimizer.add_hook(chainer.optimizer.GradientClipping(10.0))

    # Load the matching optimizer snapshot ("<name>_epoch_<N>.opt").
    opt_saved_name = "{0}_epoch_".format(exper_name)
    saved_opts = [
        m for m in os.listdir(models_base_folder)
        if m.startswith(opt_saved_name) and m.endswith(".opt")
    ]
    if len(saved_opts) > 0:
        # Pick the snapshot with the highest epoch number.
        opt_saved_name = list(
            sorted(saved_opts,
                   key=lambda name: int(name.split(".")[0].split("_")[-1]),
                   reverse=True))[0]
    else:
        # Bug fix: this branch previously reported "No model was found to
        # load" even though it is the optimizer snapshot that is missing.
        print("No optimizer was found to load")
        return
    path = os.path.join(models_base_folder, opt_saved_name)
    serializers.load_npz(path, optimizer)

    trees, tree_labels = pick_subsets(trees, tree_labels, classes=classes)
    train_trees, train_lables, test_trees, test_lables, classes, cv = split_trees(
        trees,
        tree_labels,
        n_folds=5,
        shuffle=True,
        seed=seed,
        iterations=args.iterations)

    # Opened only after every early-return above, so a failed resume no
    # longer leaks an open file handle (the original opened it first thing).
    output_file = open(os.path.join(output_folder,
                                    exper_name + "_results.txt"),
                       mode="a")

    best_scores = (-1, -1, -1)  # (epoch, loss, accuracy)

    for epoch in range(last_epoch, n_epoch + 1):
        print('Epoch: {0:d} / {1:d}'.format(epoch, n_epoch))
        print("optimizer lr = ", optimizer.lr)
        print('Train')
        training_accuracy, training_loss = train(model,
                                                 train_trees,
                                                 train_lables,
                                                 optimizer,
                                                 batch_size,
                                                 shuffle=True)
        print('Test')
        test_accuracy, test_loss = evaluate(model, test_trees, test_lables,
                                            batch_size)
        print()

        # Save the model with the best accuracy so far (ties broken by loss).
        saved = False
        if args.save > 0 and epoch > 0:
            epoch_, loss_, acc_ = best_scores
            if test_accuracy > acc_ or (test_accuracy >= acc_
                                        and test_loss <= loss_):
                remove_old_model(models_base_folder, exper_name, epoch_)
                save_new_model(model, optimizer, models_base_folder,
                               exper_name, epoch)
                saved = True
                print("saving ... ")
                best_scores = (epoch, test_loss, test_accuracy)

        output_file.write(
            "{0:<10}{1:<20.10f}{2:<20.10f}{3:<20.10f}{4:<20.10f}{5:<20.10f}{6:<10}\n"
            .format(epoch, optimizer.lr, training_loss, test_loss,
                    training_accuracy, test_accuracy,
                    "saved" if saved else ""))
        output_file.flush()

        # Early stopping once the test metrics saturate (after a warm-up).
        if epoch >= 5 and (test_loss < 0.001 or test_accuracy >= 1.0):
            output_file.write("\tEarly Stopping\n")
            print("\tEarly Stopping")
            break

    output_file.close()
Ejemplo n.º 8
0
    #     ("RF_250_sep_15_labels1", "15_authors.labels1.txt"),
    #     ("RF_250_sep_15_labels2", "15_authors.labels2.txt"),
    #     ("RF_250_sep_15_labels3", "15_authors.labels3.txt"),
    #     ("RF_250_sep_15_labels4", "15_authors.labels4.txt"),
    #     ("RF_250_sep_15_labels5", "15_authors.labels5.txt")
    # ]

    # train_labels = [
    #     ("RF_500_70_labels1", "70_authors.labels1.txt"),
    # ]
    args = parser.parse_args()
    n_folds = args.folds
    exper_name = args.name
    output_folder = os.path.join("results",args.folder)  # args.folder  #R"C:\Users\bms\PycharmProjects\stylemotery_code" #
    dataset_folder = os.path.join("dataset", args.dataset)
    trees, tree_labels, lable_problems, features = parse_src_files(dataset_folder,seperate_trees=False)
    #print(len(trees))
    pipline = Pipeline([
        ('astvector', ASTVectorizer(features, ngram=2, v_skip=0, normalize=True, idf=True, dtype=np.float32)),
        ('selection', TopRandomTreesEmbedding(k=1000, n_estimators=1500, max_depth=20)),
        # PredefinedFeatureSelection()),
        # ('randforest',LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr'))])
        ('randforest',RandomForestClassifier(n_estimators=500, min_samples_split=2, max_features="auto", criterion="entropy"))])
    # ('randforest', xgboost.XGBClassifier(learning_rate=0.1,max_depth= 10,subsample=1.0, min_child_weight = 5,colsample_bytree = 0.2 ))])
    # exp_relax(pipline,trees,tree_labels,lable_problems, relax=1,cv=cv)

    # exper_name = model_name
    # args.train = train_file
    print()
    print(exper_name, flush=True)
    if args.train:
Ejemplo n.º 9
0
def main_experiment(ensembles):
    """Evaluate an ensemble of saved BiLSTM snapshots on the test split.

    Reads an experiment config (``--config``), rebuilds the model, loads
    every ``*.my`` snapshot from ``saved_models/bilstm`` and reports the
    ensemble test accuracy.

    Parameters
    ----------
    ensembles : int
        Intended cap on how many snapshots to ensemble; currently only used
        by the commented-out per-epoch loading loop below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', '-n', type=str, default="default_experiment", help='Experiment name')
    parser.add_argument('--config', '-c', type=str, default="", help='Configuration file')

    args = parser.parse_args()
    if args.config == "":
        parser.print_help()
        return
    args, seed, classes, saved_epochs, last_epoch = read_config(args.config)

    n_units = args.units
    batch_size = args.batchsize
    gpu = args.gpu
    models_base_folder = "saved_models"
    output_folder = os.path.join("results", args.folder)
    exper_name = args.name
    dataset_folder = os.path.join("dataset", args.dataset)
    seperate_trees = args.seperate
    model_name = args.model
    layers = args.layers
    dropout = args.dropout
    cell = "lstm"  # args.cell
    residual = args.residual

    output_file = sys.stdout  # open(os.path.join(output_folder, exper_name + "_results.txt"), mode="a")

    trees, tree_labels, lable_problems, tree_nodes = parse_src_files(dataset_folder, seperate_trees=seperate_trees)
    if args.train:
        rand_seed, classes = read_train_config(os.path.join("train", args.dataset, args.train))
        trees, tree_labels = pick_subsets(trees, tree_labels, classes=classes)
    else:
        if args.classes > -1:
            # Bug fix: this branch previously referenced `rand_seed`, which
            # is only defined on the `args.train` path, raising NameError.
            # Use the seed loaded from the experiment config instead.
            trees, tree_labels = pick_subsets(trees, tree_labels, labels=args.classes, seed=seed, classes=None)

    if model_name == "lstm":
        model = RecursiveLSTM(n_units, len(classes), layers=layers, dropout=dropout, feature_dict=tree_nodes,
                              classes=classes, cell=cell, residual=residual)
    elif model_name == "bilstm":
        model = RecursiveBiLSTM(n_units, len(classes), layers=layers, dropout=dropout, feature_dict=tree_nodes,
                                classes=classes, cell=cell, residual=residual)
    elif model_name == "treelstm":
        model = RecursiveTreeLSTM(n_children=layers, n_units=n_units, n_label=len(classes), dropout=dropout,
                                  feature_dict=tree_nodes, classes=classes)
    else:
        print("No model was found")
        return

    models = []

    # for epoch in saved_epochs[::1][:ensembles]:
    # # load the model
    #     model_saved_name = "{0}_epoch_{1}".format(exper_name,epoch)
    #     output_file.write("load {0} ... \n".format(model_saved_name))
    #     saved_models = [m for m in os.listdir(os.path.join(models_base_folder,"lstm2"))
    #                     if m == model_saved_name + ".my"]
    #     if len(saved_models) > 0:
    #         # pick the best one
    #         model_saved_name = list(sorted(saved_models, key=lambda name: int(name.split(".")[0].split("_")[-1]), reverse=True))[0]
    #     else:
    #         print("No model was found to load")
    #         return
    #     path = os.path.join(models_base_folder,"lstm2",model_saved_name)
    #     serializers.load_npz(path, model)
    #     # if gpu >= 0:
    #     #     model.to_gpu()
    #     models.append(model)

    # Load every saved BiLSTM snapshot into the ensemble.
    # NOTE(review): every entry in `models` aliases the same `model` object,
    # so load_npz overwrites the previous weights — confirm whether
    # evaluate_ensemble expects independent copies.
    model_files = [f for f in os.listdir(os.path.join(models_base_folder, "bilstm")) if f.endswith(".my")]
    for model_saved_name in model_files:
        output_file.write("load {0} ... \n".format(model_saved_name))
        path = os.path.join(models_base_folder, "bilstm", model_saved_name)
        serializers.load_npz(path, model)
        # if gpu >= 0:
        #     model.to_gpu()
        models.append(model)

    # trees, tree_labels = pick_subsets(trees, tree_labels, classes=classes)
    train_trees, train_lables, test_trees, test_lables, classes, cv = split_trees(trees, tree_labels, n_folds=5,
                                                                                  shuffle=True, seed=seed,
                                                                                  iterations=args.iterations)

    print("Ensmbel:")
    test_accuracy, test_loss = evaluate_ensemble(models, test_trees, test_lables, batch_size=batch_size)
    # test_accuracy, test_loss = evaluate(model, test_trees, test_lables, batch_size=batch_size)
    output_file.write("{0:<20.10f}\n".format(test_accuracy))
    output_file.flush()

    # Bug fix: output_file may be sys.stdout (as configured above); closing
    # it would break all subsequent printing in the process.
    if output_file is not sys.stdout:
        output_file.close()
Ejemplo n.º 10
0
from ast_tree.traverse import ast_print
from utils.dataset_utils import parse_src_files, get_basefolder, make_binary_tree
import os

if __name__ == "__main__":
    # Parse the source trees from the default dataset folder one level up,
    # then print the first tree before and after binarization.
    trees, tree_labels, lable_problems = parse_src_files(os.path.join(".." ,get_basefolder()))
    ast_print(trees[0])
    # NOTE(review): make_binary_tree is called elsewhere in this project as
    # make_binary_tree(trees, layers) — confirm the single-argument form is
    # still a valid signature.
    binary_tree = make_binary_tree(trees[0])
    ast_print(binary_tree)