Esempio n. 1
0
def divide_name(handles, handles2names):
    """Resolve handles to names and return their packed indices and lengths.

    Args:
        handles: iterable of keys into ``handles2names``.
        handles2names: mapping from handle to name; a handle that is absent
            propagates the mapping's lookup error.

    Returns:
        A ``(names_idx, names_len)`` pair: the first output of
        ``embedAndPack`` passed through ``to_self_cuda``, and the second
        output flattened.
    """
    resolved_names = []
    for handle in handles:
        resolved_names.append(handles2names[handle])

    packed_idx, packed_len = embedAndPack(resolved_names)
    # Diagnostic output of the packed indices before they are moved.
    print(packed_idx)
    return to_self_cuda(packed_idx), packed_len.flatten()
def eval_model(model, test_names):
    """Score ``model`` on ``test_names`` and return the macro-averaged F1.

    For every name the handle is obtained by dropping the last four
    characters and lower-casing the rest. The handle's pickled records are
    read from the module-level ``txn`` store; element 0 of each record is
    used as a date and the remaining elements as the tweet embedding.
    Sequences shorter than ``fix_seq_len`` are right-padded with all-zero
    vectors of width 768.

    Args:
        model: network exposing ``eval()``; invoked as
            ``model(fix_seq_len, None, X, seq_lens, [(X_emoji, None)])``.
        test_names: sliceable sequence of names, processed in chunks of the
            module-level ``batch_size``.

    Returns:
        ``f1_score(..., average='macro')`` computed between the labels given
        by ``map_attribute`` and the predicted classes.
    """
    # Local import so the block does not depend on how the file imports torch.
    import torch

    y_hat_test_class = []
    y_test = []
    model.eval()
    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for start in range(0, len(test_names), batch_size):
            X_batch_test = []
            X_batch_test_emoji = []
            seq_lens = []
            for name in test_names[start:start + batch_size]:
                handle = name[:-4].lower()
                # NOTE: pickle.loads can execute arbitrary code; this is only
                # safe while the contents of ``txn`` are trusted.
                records = pickle.loads(txn.get(handle.encode()))
                dates = [rec[0] for rec in records][::-1]
                tweet_emb = [rec[1:] for rec in records]
                # The length is recorded before padding so the model sees the
                # real sequence length.
                seq_lens.append(len(tweet_emb))
                # NOTE(review): sequences longer than fix_seq_len are not
                # truncated here — confirm the store never holds more.
                tweet_emb.extend(
                    [0] * 768 for _ in range(fix_seq_len - len(tweet_emb)))
                emoji_idx = process_tweet(handle, dates, longest_emoji_len,
                                          word2idx, fix_seq_len)[::-1]
                X_batch_test_emoji.append(emoji_idx)
                X_batch_test.append(tweet_emb)
                y_test.append(map_attribute(handle))

            X_batch_test = to_float_cuda(X_batch_test)
            # Swap the first two axes before handing the batch to the model.
            X_batch_test = X_batch_test.permute(1, 0, 2)
            X_batch_test_emoji = to_self_cuda(X_batch_test_emoji)
            y_hat_test = model(fix_seq_len, None, X_batch_test, seq_lens,
                               [(X_batch_test_emoji, None)])

            scores = y_hat_test.cpu().detach().numpy()
            if inference_type == InferenceType.age:
                # Multi-class output: pick the highest-scoring class per row.
                y_hat_test_class.extend(np.argmax(scores, axis=1))
            else:
                # Single-score output: threshold at 0.5, keep column 0.
                binary = np.where(scores < 0.5, 0, 1)
                y_hat_test_class.extend(row[0] for row in binary)

    print(y_test, y_hat_test_class)
    return f1_score(y_test, y_hat_test_class, average='macro')
Esempio n. 3
0
def process_tbn(X_train, y_train, X_test, y_test):
    """Train and evaluate the model selected by the global ``processType``.

    Column 0 of ``X_train`` / ``X_test`` is taken as the handle of each row;
    the remaining columns are cast to float and split by ``divide_data`` into
    numerical, bio, tweet and network feature groups.

    Args:
        X_train: 2-D NumPy array, handle in column 0, features after it.
        y_train: training labels; passed through ``to_float_cuda``.
        X_test: 2-D NumPy array with the same layout as ``X_train``.
        y_test: test labels; passed through ``to_self_cuda``.

    Returns:
        Whatever ``eval_model`` / ``han_eval_model`` returns for the branch
        that ran.

    Raises:
        ValueError: if ``processType`` matches none of the handled members
            (previously this surfaced as an unbound-variable error).
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24+), so the builtin is used directly.
    handles_train = X_train[:, :1].flatten()
    X_train = X_train[:, 1:].astype(float)
    handles_test = X_test[:, :1].flatten()
    X_test = X_test[:, 1:].astype(float)

    (numerical_train, embedding_bio_train, embedding_tweet_train,
     embedding_network_train) = divide_data(X_train, bioLen, numLen, tweetLen)
    y_train = to_float_cuda(y_train)

    (numerical_test, embedding_bio_test, embedding_tweet_test,
     embedding_network_test) = divide_data(X_test, bioLen, numLen, tweetLen)
    y_test = to_self_cuda(y_test)

    bin_label = True
    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)

        # Model 1: trained only on rows whose label is > 1, relabelled by
        # transfer_y.
        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        y_train_greater = to_float_cuda(y_train_greater)
        model1 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model1)
        train_model(model1, embedding_train_greater, y_train_greater)
        print("--------1")

        # Model 2: trained only on rows whose label is <= 1, labels as-is.
        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        y_train_smaller = y_train[index]
        model2 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model2)
        train_model(model2, embedding_train_smaller, y_train_smaller)
        print("--------2")

        # Root model: trained on every row with labels relabelled by
        # transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        model = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, y_train_root)
        # NOTE(review): y_test_root is not used afterwards; the call is kept
        # in case transfer_y has side effects — confirm and drop if not.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        print("--------3")

        auc = han_eval_model([model1, model2, model], embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): "/" binds tighter than "+", so only hidden_size is
        # halved in the second argument — confirm this is intended.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        l_out = 8
        # Bio and tweet embeddings become a new axis 1 of a single tensor.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)

        # Model 1: rows whose label is > 1, relabelled by transfer_y.
        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        numerical_train_greater = numerical_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        # NOTE(review): the LstmAttention instances in this branch are built
        # and moved to the device but never passed to a model; they are kept
        # because constructing them may advance the RNG state.
        lstm_model1 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model1 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model1)
        to_cuda(model1)
        print(y_train_greater)
        train_model(model1, numerical_train_greater, embedding_train_greater,
                    y_train_greater)
        print("--------1")

        # Model 2: rows whose label is <= 1, labels as-is.
        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        numerical_train_smaller = numerical_train[index]
        y_train_smaller = y_train[index]
        lstm_model2 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model2 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model2)
        to_cuda(model2)
        train_model(model2, numerical_train_smaller, embedding_train_smaller,
                    y_train_smaller)
        print("--------2")

        # Root model: every row, labels relabelled by transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train_root)
        # NOTE(review): unused result, kept in case transfer_y has side
        # effects — confirm and drop if not.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        print("--------3")

        auc = han_eval_model([model1, model2, model], numerical_test,
                             embedding_test, y_test)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        print(train_names_idx, train_names_len)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        # NOTE(review): only hidden_size is halved in the second argument
        # (operator precedence) — confirm this is intended.
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        # Bio, tweet and network embeddings stacked along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        # NOTE(review): emoji_input_ids and emoji_embeddings are not assigned
        # in this function; they must already exist at module level.
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings,
                                  dropout=0.5)

        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        # NOTE(review): the name tensors below are not used by the evaluation
        # call in this branch — likely a leftover; confirm before removing.
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         y_test)
    elif processType == ProcessType.tbn_real_att:
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        l_out = 8
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    else:
        # Fail with a clear message instead of an unbound ``auc`` at return.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))
    return auc
def train_model(model, train_names, test_names):
    """Train ``model`` with Adam for ``epochs`` epochs, evaluating after each.

    For every name the handle is obtained by dropping the last four
    characters and lower-casing the rest. The handle's pickled records are
    read from the module-level ``txn`` store; element 0 of each record is
    used as a date and the remaining elements as the tweet embedding.
    Sequences shorter than ``fix_seq_len`` are right-padded with all-zero
    vectors of width 768.

    Args:
        model: network to optimise; invoked as
            ``model(fix_seq_len, None, X, seq_lens, [(X_emoji, None)])``.
        train_names: sequence of training names; reshuffled every epoch.
        test_names: names handed to ``eval_model`` at the end of every epoch.

    Returns:
        The loss of the last processed batch as a Python float, or ``None``
        when no batch was processed (no epochs or no training names).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    is_age = inference_type == InferenceType.age
    # Cross-entropy for the age task, binary cross-entropy otherwise.
    loss_fn = nn.CrossEntropyLoss() if is_age else nn.BCELoss()

    last_loss = None
    for epoch in range(epochs):
        # eval_model() at the end of the epoch switches to eval mode, so
        # training mode is restored at the start of every epoch.
        model.train()
        # Shuffled copy: new order each epoch, caller's sequence untouched.
        train_names = random.sample(train_names, len(train_names))
        batch_starts = range(0, len(train_names), batch_size)
        for batch_no, start in enumerate(batch_starts, 1):
            X_batch_train = []
            y_batch_train = []
            X_batch_train_emoji = []
            seq_lens = []
            for name in train_names[start:start + batch_size]:
                handle = name[:-4].lower()
                # NOTE: pickle.loads can execute arbitrary code; this is only
                # safe while the contents of ``txn`` are trusted.
                records = pickle.loads(txn.get(handle.encode()))
                dates = [rec[0] for rec in records][::-1]
                tweet_emb = [rec[1:] for rec in records]
                # The length is recorded before padding so the model sees the
                # real sequence length.
                seq_lens.append(len(tweet_emb))
                tweet_emb.extend(
                    [0] * 768 for _ in range(fix_seq_len - len(tweet_emb)))
                emoji_idx = process_tweet(handle, dates, longest_emoji_len,
                                          word2idx, fix_seq_len)[::-1]
                X_batch_train.append(tweet_emb)
                X_batch_train_emoji.append(emoji_idx)
                y_batch_train.append(map_attribute(handle))

            X_batch_train = to_float_cuda(X_batch_train)
            X_batch_train_emoji = to_self_cuda(X_batch_train_emoji)
            # Swap the first two axes before handing the batch to the model.
            X_batch_train = X_batch_train.permute(1, 0, 2)
            if is_age:
                y_batch_train = to_self_cuda(y_batch_train)
            else:
                # Targets become a float column to pair with BCELoss.
                y_batch_train = to_float_cuda(y_batch_train).reshape(-1, 1)

            y_pred = model(fix_seq_len, None, X_batch_train, seq_lens,
                           [(X_batch_train_emoji, None)])
            loss = loss_fn(y_pred, y_batch_train)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            last_loss = loss.item()
            print("epoch " + str(epoch) + " with batch " + str(batch_no) +
                  " is " + str(last_loss))

        # Evaluate on the names passed by the caller; this previously read a
        # module-level X_test_names and ignored the test_names argument.
        score = eval_model(model, test_names)
        print(score)
    return last_loss
Esempio n. 5
0
def process_tbn(X_train, y_train, X_test, y_test, X_test_mic, y_test_mic):
    """Train and evaluate the model selected by the global ``processType``.

    Column 0 of ``X_train`` / ``X_test`` is taken as the handle of each row;
    the remaining columns are cast to float and split by ``divide_data`` into
    numerical, bio, tweet and network feature groups.

    Args:
        X_train: 2-D NumPy array, handle in column 0, features after it.
        y_train: training labels; reshaped to a float column unless
            ``sourceType`` is ``SourceType.age``.
        X_test: 2-D NumPy array with the same layout as ``X_train``.
        y_test: test labels (see the review note on their conversion below).
        X_test_mic: not read by this function.
        y_test_mic: passed unchanged to ``eval_model`` in the ``tbn_att``
            branch only.

    Returns:
        Whatever ``eval_model`` returns for the branch that ran; in the
        ``tbn_att`` branch this is the result of the second ("mic")
        evaluation.

    Raises:
        ValueError: if ``processType`` matches none of the handled members
            (previously this surfaced as an unbound-variable error).
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24+), so the builtin is used directly.
    handles_train = X_train[:, :1].flatten()
    X_train = X_train[:, 1:].astype(float)
    handles_test = X_test[:, :1].flatten()
    X_test = X_test[:, 1:].astype(float)

    (numerical_train, embedding_bio_train, embedding_tweet_train,
     embedding_network_train) = divide_data(X_train, bioLen, numLen, tweetLen)
    if sourceType != SourceType.age:
        y_train = to_float_cuda(y_train.reshape(-1, 1))
    else:
        y_train = to_self_cuda(y_train)

    (numerical_test, embedding_bio_test, embedding_tweet_test,
     embedding_network_test) = divide_data(X_test, bioLen, numLen, tweetLen)
    # NOTE(review): this condition is the inverse of the one used for y_train
    # ("==" here, "!=" there) — confirm that the asymmetry is intended.
    if sourceType == SourceType.age:
        y_test = to_float_cuda(y_test.reshape(-1, 1))
    else:
        y_test = to_self_cuda(y_test)

    bin_label = sourceType != SourceType.age

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        model = MyMLP(embedding_train.shape[1], 20, D_out)
        to_cuda(model)
        train_model(model, embedding_train, y_train)
        auc = eval_model(model, embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        print(handles_train)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): "/" binds tighter than "+", so only hidden_size is
        # halved in the second argument — confirm this is intended.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        # Bio and tweet embeddings become a new axis 1 of a single tensor.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        l_out = 8
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(l_out, int(l_out / 2), D_out, lstm_model,
                                      bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
        print(auc)
        # NOTE(review): numerical_test_mic and embedding_test_mic are not
        # assigned anywhere in this function and X_test_mic is never read;
        # unless they exist at module level this call raises NameError.
        # They presumably should be derived from X_test_mic the same way the
        # test tensors are derived from X_test — confirm with the caller.
        auc = eval_model(model, numerical_test_mic, embedding_test_mic,
                         y_test_mic)
        print(auc)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        # NOTE(review): only hidden_size is halved in the second argument
        # (operator precedence) — confirm this is intended.
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        # Bio, tweet and network embeddings stacked along a new axis 1.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        # NOTE(review): machine-specific absolute path; consider making it
        # configurable.
        emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings(
            "/home/yaguang/pattern/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        # The CNN is built from the vocabulary size only; the loaded
        # emoji_embeddings are not passed to it.
        emoji_cnn_model = CNN_NLP(vocab_size=dim)

        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        # NOTE(review): evaluation receives test_names_len although training
        # received no name lengths — confirm the argument lists should differ.
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         test_names_len, y_test)
    else:
        # Fail with a clear message instead of an unbound ``auc`` at return.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))
    return auc
def divide_emojis(handles, input_ids):
    """Gather the id arrays of ``handles`` and join them along axis 0.

    Args:
        handles: iterable of keys into ``input_ids``.
        input_ids: mapping from handle to an array accepted by
            ``np.concatenate``.

    Returns:
        The concatenated array passed through ``to_self_cuda``.
    """
    per_handle_ids = []
    for handle in handles:
        per_handle_ids.append(input_ids[handle])
    joined = np.concatenate(per_handle_ids, axis=0)
    return to_self_cuda(joined)