Ejemplo n.º 1
0
def main(args):
    """Train embeddings and, depending on ``args.task``, evaluate them.

    Three paths are supported:

    * ``'link-prediction'``: split the input graph, train on the training
      graph, evaluate with ``LinkPrediction`` and delete the training graph
      file afterwards.
    * ``'node-classification'``: train on ``args.input`` and evaluate with
      ``NodeClassification`` using the labels in ``args.label_file``.
    * anything else: only train the embedding; nothing is evaluated or
      reported.

    When ``args.eval_result_file`` is set and an evaluation produced a
    result, one JSON line with the metrics is appended to that file.  For
    link prediction the predictions and the ROC points are additionally
    written to ``args.predictions`` and ``args.roc``.

    Args:
        args: Namespace-like object.  Attributes read here: ``method``,
            ``task``, ``input``, ``seed``, ``weighted``, ``output``,
            ``label_file``, ``testingratio``, ``eval_result_file``,
            ``dimensions``, ``predictions`` and ``roc``.

    Raises:
        ValueError: If the task is node classification and no label file
            was given.
    """
    print('#' * 70)
    print('Embedding Method: %s, Evaluation Task: %s' %
          (args.method, args.task))
    print('#' * 70)

    # Stays None on the embedding-only path so the reporting step below is
    # skipped instead of failing on an unbound name.
    result = None

    if args.task == 'link-prediction':
        G, G_train, testing_pos_edges, train_graph_filename = split_train_test_graph(
            args.input, args.seed, weighted=args.weighted)
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output)
        time1 = time.time()
        print('Begin evaluation...')
        result = LinkPrediction(embedding_look_up, G, G_train,
                                testing_pos_edges, args.seed)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
        # The training graph file is only needed for this run.
        os.remove(train_graph_filename)

    elif args.task == 'node-classification':
        if not args.label_file:
            raise ValueError("No input label file. Exit.")
        node_list, labels = read_node_labels(args.label_file)
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output, node_list)
        time1 = time.time()
        print('Begin evaluation...')
        result = NodeClassification(embedding_look_up, node_list, labels,
                                    args.testingratio, args.seed)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
    else:
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)

    if args.eval_result_file and result:
        _results = dict(
            input=args.input,
            task=args.task,
            method=args.method,
            dimension=args.dimensions,
            user=getpass.getuser(),
            date=datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'),
            seed=args.seed,
        )

        is_link_prediction = args.task == 'link-prediction'
        if is_link_prediction:
            # The last three items are not part of the JSON summary; they
            # are written to their own output files further down.
            auc_roc, auc_pr, accuracy, f1, prediction, fpr, tpr = result
            _results['results'] = dict(
                auc_roc=auc_roc,
                auc_pr=auc_pr,
                accuracy=accuracy,
                f1=f1,
            )
        else:
            accuracy, f1_micro, f1_macro = result
            _results['results'] = dict(
                accuracy=accuracy,
                f1_micro=f1_micro,
                f1_macro=f1_macro,
            )

        with open(args.eval_result_file, 'a+') as wf:
            print(json.dumps(_results, sort_keys=True), file=wf)

        # Predictions and ROC points only exist for link prediction; the
        # node-classification result has no such fields.
        if is_link_prediction:
            with open(args.predictions, 'w') as pf:
                np.savetxt(pf, prediction, fmt='%s', delimiter=',')

            # Two CSV rows: the first holds the FPR values, the second the
            # TPR values.  newline='' lets the csv module control line
            # endings itself.
            with open(args.roc, 'w', newline='') as rf:
                csv.writer(rf).writerows([list(fpr), list(tpr)])
Ejemplo n.º 2
0
def main(args):
    """Train embeddings and, depending on ``args.task``, evaluate them.

    ``'link-prediction'`` splits the input graph, trains on the training
    graph, evaluates with ``LinkPrediction`` and removes the training graph
    file.  ``'node-classification'`` trains on ``args.input`` and evaluates
    with ``NodeClassification``.  Any other task only trains the embedding.

    When ``args.eval_result_file`` is set and an evaluation produced a
    result, one JSON line with the metrics is appended to that file.

    Args:
        args: Namespace-like object.  Attributes read here: ``method``,
            ``task``, ``input``, ``seed``, ``weighted``, ``output``,
            ``label_file``, ``testingratio``, ``eval_result_file`` and
            ``dimensions``.

    Raises:
        ValueError: If the task is node classification and no label file
            was given.
    """
    banner = '#' * 70
    print(banner)
    print('Embedding Method: %s, Evaluation Task: %s' %
          (args.method, args.task))
    print(banner)

    # Stays None on the embedding-only path so the reporting step below is
    # skipped instead of failing on an unbound name.
    result = None

    if args.task == 'link-prediction':
        G, G_train, testing_pos_edges, train_graph_filename = split_train_test_graph(
            args.input, args.seed, weighted=args.weighted)
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output)
        time1 = time.time()
        print('Begin evaluation...')
        result = LinkPrediction(embedding_look_up, G, G_train,
                                testing_pos_edges, args.seed)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
        # The training graph file is only needed for this run.
        os.remove(train_graph_filename)
    elif args.task == 'node-classification':
        if not args.label_file:
            raise ValueError("No input label file. Exit.")
        node_list, labels = read_node_labels(args.label_file)
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output, node_list)
        time1 = time.time()
        print('Begin evaluation...')
        result = NodeClassification(embedding_look_up, node_list, labels,
                                    args.testingratio, args.seed)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
    else:
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)

    if args.eval_result_file and result:
        _results = dict(
            input=args.input,
            task=args.task,
            method=args.method,
            dimension=args.dimensions,
            user=getpass.getuser(),
            date=datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'),
            seed=args.seed,
        )

        if args.task == 'link-prediction':
            auc_roc, auc_pr, accuracy, f1 = result
            _results['results'] = dict(
                auc_roc=auc_roc,
                auc_pr=auc_pr,
                accuracy=accuracy,
                f1=f1,
            )
        else:
            accuracy, f1_micro, f1_macro = result
            _results['results'] = dict(
                accuracy=accuracy,
                f1_micro=f1_micro,
                f1_macro=f1_macro,
            )

        # Append so that repeated runs accumulate one JSON record per line.
        with open(args.eval_result_file, 'a+') as wf:
            print(json.dumps(_results, sort_keys=True), file=wf)
Ejemplo n.º 3
0
def do_evaluation(
    *,
    input_path,
    training_path: Optional[str] = None,
    testing_path: Optional[str] = None,
    method,
    prediction_task,
    dimensions: int = 300,
    number_walks: int = 8,
    walk_length: int = 8,
    window_size: int = 4,
    p: float = 1.5,
    q: float = 2.1,
    alpha: float = 0.1,
    beta: float = 4,
    epochs: int = 5,
    kstep: int = 4,
    order: int = 3,
    embeddings_path: Optional[str] = None,
    predictive_model_path: Optional[str] = None,
    training_model_path: Optional[str] = None,
    evaluation_file: Optional[str] = None,
    classifier_type: Optional[str] = None,
    weighted: bool = False,
    labels_file,
):
    """Train and evaluate an NRL model.

    For ``prediction_task == 'link_prediction'`` the graphs are obtained
    from ``create_graphs`` and the embedding is evaluated with
    ``pipeline.do_link_prediction``.  Every other value of
    ``prediction_task`` is handled as node classification: the embedding is
    trained on ``input_path`` and evaluated with
    ``pipeline.do_node_classification`` using the labels read from
    ``labels_file``.

    Args:
        input_path: Graph input; passed to ``create_graphs`` or used
            directly as the training graph file.
        training_path: Passed to ``create_graphs`` (link prediction only).
        testing_path: Passed to ``create_graphs`` (link prediction only).
        method: Embedding method name.  ``'LINE'`` embeddings are fetched
            with ``get_embeddings_train()``, all others with
            ``get_embeddings()``.
        prediction_task: ``'link_prediction'`` or anything else for node
            classification.
        dimensions, number_walks, walk_length, window_size, p, q, alpha,
        beta, epochs, kstep, order, weighted: Forwarded unchanged to
            ``embedding_training``.
        embeddings_path: If given, the trained embeddings are saved there.
        predictive_model_path: Forwarded as ``save_model`` to the pipeline
            evaluation call.
        training_model_path: If given, the trained model is saved there.
        evaluation_file: Where to dump the results as JSON.  Either a path
            or an already opened writable file object.
        classifier_type: Forwarded to the pipeline evaluation call.
        labels_file: Node label file; required for node classification.

    Returns:
        dict: Run metadata (``input``, ``method``, ``dimension``, ``user``,
        ``date``) plus a ``results`` dict holding the task metrics.

    Raises:
        ValueError: If node classification is requested without a label
            file.
    """
    is_link_prediction = prediction_task == 'link_prediction'

    if is_link_prediction:
        node_list = None
        labels = None
        graph, graph_train, testing_pos_edges, train_graph_filename = create_graphs(
            input_path=input_path,
            training_path=training_path,
            testing_path=testing_path,
            weighted=weighted,
        )
    else:
        if not labels_file:
            raise ValueError("No input label file. Exit.")
        node_list, labels = read_node_labels(labels_file)
        train_graph_filename = input_path
        graph, graph_train, testing_pos_edges = None, None, None

    model = embedding_training(
        train_graph_filename=train_graph_filename,
        method=method,
        dimensions=dimensions,
        number_walks=number_walks,
        walk_length=walk_length,
        window_size=window_size,
        p=p,
        q=q,
        alpha=alpha,
        beta=beta,
        epochs=epochs,
        kstep=kstep,
        order=order,
        weighted=weighted,
    )
    if training_model_path is not None:
        model.save_model(training_model_path)
    if embeddings_path is not None:
        model.save_embeddings(embeddings_path)
    if method == 'LINE':
        embeddings = model.get_embeddings_train()
    else:
        embeddings = model.get_embeddings()

    _results = dict(
        input=input_path,
        method=method,
        dimension=dimensions,
        user=getpass.getuser(),
        date=datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'),
    )
    if is_link_prediction:
        auc_roc, auc_pr, accuracy, f1, mcc = pipeline.do_link_prediction(
            embeddings=embeddings,
            original_graph=graph,
            train_graph=graph_train,
            test_pos_edges=testing_pos_edges,
            save_model=predictive_model_path,
            classifier_type=classifier_type,
        )
        _results['results'] = dict(
            auc_roc=auc_roc,
            auc_pr=auc_pr,
            accuracy=accuracy,
            f1=f1,
            mcc=mcc,
        )
    else:
        accuracy, macro_f1, micro_f1, mcc = pipeline.do_node_classification(
            embeddings=embeddings,
            node_list=node_list,
            labels=labels,
            save_model=predictive_model_path,
            classifier_type=classifier_type,
        )
        _results['results'] = dict(
            accuracy=accuracy,
            macro_f1=macro_f1,
            micro_f1=micro_f1,
            mcc=mcc,
        )

    if evaluation_file is not None:
        if hasattr(evaluation_file, 'write'):
            # Already an open file object: dump into it and leave closing
            # to whoever opened it.
            json.dump(_results, evaluation_file, sort_keys=True, indent=2)
        else:
            # json.dump needs an object with .write(); a path (as the
            # annotation suggests) has to be opened first.
            with open(evaluation_file, 'w') as results_fp:
                json.dump(_results, results_fp, sort_keys=True, indent=2)
    return _results
Ejemplo n.º 4
0
def main(trials=None):
    """Run one embedding training / evaluation round and return its score.

    Command-line arguments come from ``parse_args()``.  When ``trials`` is
    given, hyper-parameters suggested by it (``C``, ``window_size``,
    ``pro_steps``, ``pro_mu``, ``pro_theta``) and a per-trial output name
    override the parsed arguments.

    Args:
        trials: Optional trial object exposing ``suggest_loguniform``,
            ``suggest_int``, ``suggest_uniform`` and ``number``
            (presumably an Optuna trial -- confirm against the caller).
            ``None`` runs with the parsed arguments only.

    Returns:
        The AUC-ROC for link prediction, the accuracy for node
        classification, or ``None`` when the task is neither (embedding
        only, nothing evaluated).

    Raises:
        ValueError: If the task is node classification and no label file
            was given.
    """
    args = parse_args()
    if trials is not None:
        params = {
            'C': trials.suggest_loguniform('C', 1e-10, 1e10),
            'window_size': trials.suggest_int('window_size', 1, 20),
            'pro_steps': trials.suggest_int('pro_steps', 1, 20),
            'pro_mu': trials.suggest_uniform('pro_mu', -1.0, 1.0),
            'pro_theta': trials.suggest_uniform('pro_theta', -1.0, 1.0),
            'output': "%s_trial_%s" % (args.output, str(trials.number))
        }
        # vars() exposes the namespace's own attribute dict, so updating it
        # overrides the parsed values in place.
        dargs = vars(args)
        dargs.update(params)
    print(args)

    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    print('#' * 70)
    print('Embedding Method: %s, Evaluation Task: %s' %
          (args.method, args.task))
    print('#' * 70)

    # Stays None on the embedding-only path so the reporting step below is
    # skipped instead of failing on an unbound name.
    result = None

    if args.task == 'link-prediction':
        split_kwargs = {'weighted': args.weighted}
        if trials is not None:
            # Only forwarded for trial runs; without a trial there is no
            # number to pass.
            # NOTE(review): assumes split_train_test_graph has a default
            # for trial_number -- confirm.
            split_kwargs['trial_number'] = trials.number
        G, G_train, testing_pos_edges, train_graph_filename = split_train_test_graph(
            args.input,
            args.seed,
            **split_kwargs)
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output)
        time1 = time.time()
        print('Begin evaluation...')
        result = LinkPrediction(embedding_look_up,
                                G,
                                G_train,
                                testing_pos_edges,
                                args.seed,
                                C=args.C)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
        # The training graph file is only needed for this run.
        os.remove(train_graph_filename)
    elif args.task == 'node-classification':
        if not args.label_file:
            raise ValueError("No input label file. Exit.")
        node_list, labels = read_node_labels(args.label_file)
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)
        embedding_look_up = load_embedding(args.output, node_list)
        time1 = time.time()
        print('Begin evaluation...')
        result = NodeClassification(embedding_look_up,
                                    node_list,
                                    labels,
                                    args.testingratio,
                                    args.seed,
                                    C=args.C)
        eval_time = time.time() - time1
        print('Prediction Task Time: %.2f s' % eval_time)
    else:
        train_graph_filename = args.input
        time1 = time.time()
        embedding_training(args, train_graph_filename)
        embed_train_time = time.time() - time1
        print('Embedding Learning Time: %.2f s' % embed_train_time)

    if not result:
        # Nothing was evaluated, so there is nothing to report or return.
        return None

    if args.task == 'link-prediction':
        auc_roc, auc_pr, accuracy, f1 = result
        metrics = dict(
            auc_roc=auc_roc,
            auc_pr=auc_pr,
            accuracy=accuracy,
            f1=f1,
        )
        score = auc_roc
    else:
        accuracy, f1_micro, f1_macro = result
        metrics = dict(
            accuracy=accuracy,
            f1_micro=f1_micro,
            f1_macro=f1_macro,
        )
        score = accuracy

    if args.eval_result_file:
        _results = dict(
            input=args.input,
            task=args.task,
            method=args.method,
            dimension=args.dimensions,
            user=getpass.getuser(),
            date=datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'),
            seed=args.seed,
        )
        _results['results'] = metrics

        # Append so that repeated runs accumulate one JSON record per line.
        with open(args.eval_result_file, 'a+') as wf:
            print(json.dumps(_results, sort_keys=True), file=wf)

    # The score is returned whether or not a result file was requested, so
    # callers always get the metric of a completed evaluation.
    return score
Ejemplo n.º 5
0
def train_model(
    *,
    input_path,
    method,
    dimensions: int = 300,
    number_walks: int = 8,
    walk_length: int = 8,
    window_size: int = 4,
    p: float = 1.5,
    q: float = 2.1,
    alpha: float = 0.1,
    beta: float = 4,
    epochs: int = 5,
    kstep: int = 4,
    order: int = 3,
    embeddings_path: Optional[str] = None,
    predictive_model_path: Optional[str] = None,
    training_model_path: Optional[str] = None,
    classifier_type: Optional[str] = None,
    weighted: bool = False,
    labels_file: Optional[str] = None,
    prediction_task,
):
    """Train a graph with an NRL model.

    The embedding is trained on ``input_path``.  Afterwards a downstream
    model is built: ``pipeline.create_prediction_model`` for
    ``prediction_task == 'link_prediction'``, otherwise
    ``pipeline.do_node_classification``.

    Args:
        input_path: Edge-list file of the graph to train on.
        method: Embedding method name.  ``'LINE'`` embeddings are fetched
            with ``get_embeddings_train()``, all others with
            ``get_embeddings()``.
        dimensions, number_walks, walk_length, window_size, p, q, alpha,
        beta, epochs, kstep, order, weighted: Forwarded unchanged to
            ``embedding_training``.
        embeddings_path: If given, the trained embeddings are saved there.
        predictive_model_path: Forwarded as ``save_model`` to the pipeline
            call.
        training_model_path: If given, the trained model is saved there.
        classifier_type: Forwarded to the pipeline call.
        labels_file: Node label file; required when ``prediction_task`` is
            ``'node_classification'``.
        prediction_task: ``'link_prediction'`` selects the link-prediction
            model; every other value goes down the node-classification
            path.

    Raises:
        ValueError: If ``prediction_task`` is ``'node_classification'`` and
            no label file was given.
    """
    node_list, labels = None, None
    if prediction_task == 'node_classification':
        if not labels_file:
            raise ValueError("No input label file. Exit.")
        node_list, labels = read_node_labels(labels_file)

    model = embedding_training(
        train_graph_filename=input_path,
        method=method,
        dimensions=dimensions,
        number_walks=number_walks,
        walk_length=walk_length,
        window_size=window_size,
        p=p,
        q=q,
        alpha=alpha,
        beta=beta,
        epochs=epochs,
        kstep=kstep,
        order=order,
        weighted=weighted,
    )
    if training_model_path is not None:
        model.save_model(training_model_path)
    # embeddings_path is optional (defaults to None), so only save when a
    # destination was actually supplied.
    if embeddings_path is not None:
        model.save_embeddings(embeddings_path)

    if method == 'LINE':
        embeddings = model.get_embeddings_train()
    else:
        embeddings = model.get_embeddings()

    if prediction_task == 'link_prediction':
        # The original graph is only needed by the link-prediction model,
        # so it is read here rather than for every task.
        original_graph = nx.read_edgelist(input_path)
        pipeline.create_prediction_model(
            embeddings=embeddings,
            original_graph=original_graph,
            save_model=predictive_model_path,
            classifier_type=classifier_type,
        )
    else:
        pipeline.do_node_classification(
            embeddings=embeddings,
            node_list=node_list,
            labels=labels,
            classifier_type=classifier_type,
            save_model=predictive_model_path,
        )