Beispiel #1
0
def process_tbn(X_train, y_train, X_test, y_test):
    """Train and evaluate the model variant selected by the global ``processType``.

    Column 0 of ``X_train`` / ``X_test`` is split off as the per-row handle;
    the remaining columns are cast to float and handed to ``divide_data``,
    which returns the numerical, bio, tweet and network parts. Training labels
    go through ``to_float_cuda`` and test labels through ``to_self_cuda``.

    For ``ProcessType.mlp`` and ``ProcessType.tbn_att`` three models are
    trained -- one on the rows with ``y_train > 1``, one on the rows with
    ``y_train <= 1`` and one "root" model on all rows with labels remapped by
    ``transfer_y`` -- and they are scored together by ``han_eval_model``.
    Every other branch trains a single model and scores it with
    ``eval_model``.

    The function relies on module-level names that are not defined here:
    ``processType``, ``bioLen``, ``numLen``, ``tweetLen``, ``D_out``,
    ``batch_size``, ``hidden_size``, ``embedding_length``, ``handles2names``
    and, for ``ProcessType.tbnn_e_att``, ``emoji_input_ids`` and
    ``emoji_embeddings``.

    NOTE(review): this file chunk contains a second, six-parameter definition
    of ``process_tbn``; if both live in one module the later one shadows this
    one -- confirm which is intended.

    Args:
        X_train: 2-D array whose first column is the handle and whose other
            columns are castable to float.
        y_train: Training labels (compared against ``1`` after conversion).
        X_test: Test array with the same column layout as ``X_train``.
        y_test: Test labels.

    Returns:
        The value returned by ``eval_model`` / ``han_eval_model``.

    Raises:
        ValueError: If ``processType`` is not one of the handled variants.
    """
    # Separate the handle column from the feature columns. The builtin
    # ``float`` replaces the ``np.float`` alias, which NumPy 1.24 removed.
    X_train, handles_train = X_train[:, 1:].astype(
        float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(
        float), X_test[:, :1].flatten()

    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    y_train = to_float_cuda(y_train)

    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_self_cuda(y_test)

    # This variant always passes bin_label=True to the models.
    bin_label = True
    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)

        # Model 1: only the rows whose label is > 1, labels remapped by
        # transfer_y (presumably to a binary target -- TODO confirm).
        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        y_train_greater = to_float_cuda(y_train_greater)
        model1 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model1)
        train_model(model1, embedding_train_greater, y_train_greater)
        print("--------1")

        # Model 2: only the rows whose label is <= 1, labels used as-is.
        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        y_train_smaller = y_train[index]
        model2 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model2)
        train_model(model2, embedding_train_smaller, y_train_smaller)
        print("--------2")

        # Root model: all rows, labels remapped by transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        model = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, y_train_root)
        # NOTE(review): the result is unused; the call is kept in case
        # transfer_y modifies its argument in place -- confirm and drop.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        print("--------3")

        auc = han_eval_model([model1, model2, model], embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): `/` binds tighter than `+`, so the second argument is
        # shape[1] + hidden_size / 2, not the mean of the two -- confirm.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        l_out = 8
        # Stack bio and tweet embeddings along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)

        # Model 1: rows with label > 1.
        index = y_train[:] > 1
        embedding_train_greater, numerical_train_greater = embedding_train[
            index], numerical_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        # NOTE(review): the LstmAttention instances in this branch are built
        # and moved to the device but never passed to a model; they are kept
        # because constructing them may affect random-number state.
        lstm_model1 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model1 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model1)
        to_cuda(model1)
        print(y_train_greater)
        train_model(model1, numerical_train_greater, embedding_train_greater,
                    y_train_greater)
        print("--------1")

        # Model 2: rows with label <= 1.
        index = y_train[:] <= 1
        embedding_train_smaller, numerical_train_smaller = embedding_train[
            index], numerical_train[index]
        y_train_smaller = y_train[index]
        lstm_model2 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model2 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model2)
        to_cuda(model2)
        train_model(model2, numerical_train_smaller, embedding_train_smaller,
                    y_train_smaller)
        print("--------2")

        # Root model: all rows, labels remapped by transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train_root)
        # NOTE(review): result unused; kept in case transfer_y works in place.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        print("--------3")

        auc = han_eval_model([model1, model2, model], numerical_test,
                             embedding_test, y_test)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        print(train_names_idx, train_names_len)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        # Bio, tweet and network embeddings stacked along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        # NOTE(review): emoji_input_ids / emoji_embeddings are not assigned in
        # this function (the loader call was commented out in the original),
        # so they must exist at module level -- confirm.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings,
                                  dropout=0.5)

        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        # NOTE(review): these two results are not used by the evaluation
        # below; the call is kept unchanged pending confirmation.
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         y_test)
    elif processType == ProcessType.tbn_real_att:
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        l_out = 8
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    else:
        # Previously this fell through to `return auc` with `auc` unbound.
        raise ValueError(f"Unsupported processType: {processType!r}")
    return auc
def process_tbn(X_train, y_train, X_test, y_test, X_test_mic, y_test_mic):
    """Train the model selected by the global ``processType`` and evaluate it.

    Column 0 of ``X_train`` / ``X_test`` is split off as the per-row handle;
    the remaining columns are cast to float and handed to ``divide_data``,
    which returns the numerical, bio, tweet and network parts. Labels are
    converted with ``to_float_cuda`` / ``to_self_cuda`` depending on the
    module-level ``sourceType``.

    Only the ``ProcessType.tbn_att`` branch uses the secondary test set
    (``X_test_mic`` / ``y_test_mic``): it prints the score on the primary test
    set, then evaluates on the secondary set, prints that score and returns
    it. All other branches return the score on ``X_test`` / ``y_test``.

    The function relies on module-level names that are not defined here:
    ``processType``, ``sourceType``, ``bioLen``, ``numLen``, ``tweetLen``,
    ``D_out``, ``batch_size``, ``hidden_size``, ``embedding_length`` and
    ``handles2names``.

    NOTE(review): the train labels are converted to float when
    ``sourceType != SourceType.age`` while the test labels are converted to
    float when ``sourceType == SourceType.age``. The asymmetry is preserved
    from the original -- confirm it is intended.

    Args:
        X_train: 2-D array whose first column is the handle and whose other
            columns are castable to float.
        y_train: Training labels.
        X_test: Test array with the same column layout as ``X_train``.
        y_test: Test labels.
        X_test_mic: Secondary test array; assumed to share the column layout
            of ``X_test`` (TODO confirm against caller). Only read when
            ``processType`` is ``ProcessType.tbn_att``.
        y_test_mic: Labels for ``X_test_mic``; assumed to be in the same raw
            form as ``y_test`` (TODO confirm against caller).

    Returns:
        The value returned by the last ``eval_model`` call of the branch.

    Raises:
        ValueError: If ``processType`` is not one of the handled variants.
    """
    # Separate the handle column from the feature columns. The builtin
    # ``float`` replaces the ``np.float`` alias, which NumPy 1.24 removed.
    X_train, handles_train = X_train[:, 1:].astype(
        float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(
        float), X_test[:, :1].flatten()

    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    y_train = to_float_cuda(y_train.reshape(
        -1, 1)) if sourceType != SourceType.age else to_self_cuda(y_train)

    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_float_cuda(y_test.reshape(
        -1, 1)) if sourceType == SourceType.age else to_self_cuda(y_test)

    bin_label = False if sourceType == SourceType.age else True

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        model = MyMLP(embedding_train.shape[1], 20, D_out)
        to_cuda(model)
        train_model(model, embedding_train, y_train)
        auc = eval_model(model, embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        print(handles_train)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): `/` binds tighter than `+`, so the second argument is
        # shape[1] + hidden_size / 2, not the mean of the two -- confirm.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        # Stack bio and tweet embeddings along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)

        # Prepare the secondary test set with the same steps as X_test. The
        # original referenced numerical_test_mic / embedding_test_mic without
        # ever defining them and never read X_test_mic.
        features_mic = X_test_mic[:, 1:].astype(float)
        numerical_test_mic, embedding_bio_test_mic, embedding_tweet_test_mic, _ = divide_data(
            features_mic, bioLen, numLen, tweetLen)
        embedding_test_mic = torch.stack(
            (embedding_bio_test_mic, embedding_tweet_test_mic), axis=1)
        # Same conversion rule as y_test so both evaluations get the same
        # label representation.
        y_test_mic = to_float_cuda(y_test_mic.reshape(
            -1, 1)) if sourceType == SourceType.age else to_self_cuda(
                y_test_mic)

        l_out = 8
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(l_out, int(l_out / 2), D_out, lstm_model,
                                      bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
        print(auc)
        # The score on the secondary set is the one that is returned.
        auc = eval_model(model, numerical_test_mic, embedding_test_mic,
                         y_test_mic)
        print(auc)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        # Bio, tweet and network embeddings stacked along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings(
            "/home/yaguang/pattern/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        # The CNN is built from the vocabulary size only; the loaded
        # emoji_embeddings are not passed to it in this variant.
        emoji_cnn_model = CNN_NLP(vocab_size=dim)

        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        # NOTE(review): evaluation passes test_names_len although training
        # passed no name lengths; kept as in the original -- confirm against
        # eval_model's expected arguments.
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         test_names_len, y_test)
    else:
        # Previously this fell through to `return auc` with `auc` unbound.
        raise ValueError(f"Unsupported processType: {processType!r}")
    return auc