import random

import nltk

from corpus_process import Corpus


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
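

# word2features is not defined in this snippet; a minimal CRF-style feature
# extractor might look like this (the feature set is an assumption):
def word2features(sent, i):
    token, postag, _ = sent[i]
    return {
        'word.lower()': token.lower(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': postag,
        'BOS': i == 0,
        'EOS': i == len(sent) - 1,
    }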


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


files_paths = [
    'data/sensitive1.tsv', 'data/sensitive2.tsv', 'data/sensitive3.tsv'
]
corpus = Corpus(files_paths)
sents, tags = corpus.read_tsv()

data = []

# Attach an NLTK POS tag to each token, yielding (token, POS tag, label)
# triples per sentence.
for i, sent in enumerate(sents):
    tmp = []
    for j, (token, postag) in enumerate(nltk.pos_tag(sent)):
        tmp.append((token, postag, tags[i][j]))
    data.append(tmp)

random.shuffle(data)

# Everything after the first 996 shuffled sentences is used for training.
X_train = [sent2features(s) for s in data[996:]]
y_train = [sent2labels(s) for s in data[996:]]
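
# The held-out slice presumably serves as the test set (an assumption; the
# original snippet stops here):
X_test = [sent2features(s) for s in data[:996]]
y_test = [sent2labels(s) for s in data[:996]]
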
import os
import pandas as pd
from corpus_process import Corpus

data_dir = os.path.join(os.pardir, 'data')
tsv_files = ['sensitive1.tsv', 'sensitive2.tsv', 'sensitive3.tsv']

corpus = Corpus([os.path.join(data_dir, f) for f in tsv_files])

import numpy as np
import torch


def create_pretrain_embeddings(corp, embedding_dim, pretrained_file):
    # Assumes a GloVe-style text file: each line holds a token followed by
    # its vector values, whitespace-separated.
    pretrain_words = {}
    with open(pretrained_file) as f:
        for line in f:
            infos = line.split()
            wd = infos[0]
            vec = np.array(infos[1:]).astype(np.float64)
            pretrain_words[wd] = vec
    word_idx = corp.r_wordIDs  # maps vocabulary index -> token
    vocab_num = corp.wordNum
    weights_matrix = np.zeros((vocab_num, embedding_dim))
    for idx in word_idx.keys():
        try:
            weights_matrix[idx] = pretrain_words[word_idx[idx]]
        except KeyError:
            # Tokens without a pretrained vector get a random initialization.
            weights_matrix[idx] = np.random.normal(size=(embedding_dim,))
    if torch.cuda.is_available():  # run in GPU
        return torch.Tensor(weights_matrix).cuda()
    else:
        return torch.Tensor(weights_matrix)
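
# Example usage (the file name is hypothetical; any whitespace-separated
# word-vector file matching embedding_dim works):
#   weights = create_pretrain_embeddings(corp, 100, 'glove.6B.100d.txt')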


if __name__ == '__main__':
    train_files = [
        'Datafiles/technology_201501.data', 'Datafiles/technology_201502.data',
        'Datafiles/technology_201503.data', 'Datafiles/technology_201504.data'
    ]
    test_file = 'Datafiles/technology_201505_test.data'
    valid_file = 'Datafiles/technology_201505_valid.data'
    corp = Corpus(train_files, test_file, valid_file)
    print(corp.userNum, len(corp.user_history))
    his = create_history_tensor(corp.userNum, corp.user_history)
    print(his.size())
    print(his[-1])
import os, json
from corpus_process import Corpus

files = ['sensitive1.tsv', 'sensitive2.tsv', 'sensitive3.tsv']
tsv_path = [os.path.join(os.pardir, 'data', f) for f in files]
vocabulary_path = os.path.join(os.pardir, 'data', 'vocabulary_corpus.json')

corpus = Corpus(tsv_path)

x, _ = corpus.read_tsv()

# Build a 1-indexed token -> id map (index 0 is left free, e.g. for padding).
vocabulary = {}
for sent in x:
    for tok in sent:
        if tok not in vocabulary:
            vocabulary[tok] = len(vocabulary) + 1

with open(vocabulary_path, 'w') as fd:
    json.dump(vocabulary, fd)
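
# A later consumer can restore the mapping with json.load; tokens and ids are
# JSON-native types, so the dictionary round-trips unchanged:
with open(vocabulary_path) as fd:
    vocabulary_loaded = json.load(fd)
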
from torch.utils.data import DataLoader, random_split
from transformers import BertForTokenClassification, AdamW
from pytorch_pretrained_bert import BertAdam
import torch
from seqeval.metrics import accuracy_score, f1_score, classification_report

from corpus_process import Corpus

files_train_paths = ['data/sensitive1.tsv', 'data/sensitive3.tsv']
files_test_path = ['data/sensitive2.tsv']
pretrained_dataset = 'bert-base-uncased'

# ========================================
#               DATA
# ========================================

data_train = Corpus(files_train_paths).get_dataset_bert()
data_test = Corpus(files_test_path).get_dataset_bert()

val_size = int(0.1 * len(data_train))
train_size = len(data_train) - val_size

train_dataset, val_dataset = random_split(data_train, [train_size, val_size])

batch_size = 64

train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
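
# The imports above point at the next step; a minimal sketch (num_labels and
# the learning rate are assumptions, not values from the original snippet):
model = BertForTokenClassification.from_pretrained(pretrained_dataset,
                                                   num_labels=2)
optimizer = AdamW(model.parameters(), lr=2e-5)
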
def train(config):
    print('Start training: ' + config.filename + ' ' + config.modelname)
    trainfile, testfile, validfile = find_files(config.filename, config.month_num)
    modelname = config.modelname
    # Process the corpus and prepare pretrained parameters (if any)
    corp = Corpus(trainfile, testfile, validfile)
    config.user_num, config.conv_num, config.vocab_num = corp.userNum, corp.convNum, corp.wordNum
    train_convs, train_convs_per_user, test_data, dev_data = form_dataset(corp, config.batch_size)
    conv_data = create_tensor(corp.convs)
    if config.pretrained_file == 'NULL':
        embedding_matrix = None
    else:
        embedding_matrix = create_pretrain_embeddings(corp, config.embedding_dim, config.pretrained_file)
    # Set the model and saving path
    if modelname == 'DCR':
        user_history = create_history_tensor(corp.userNum, corp.user_history)
        arcs = create_arc_info(conv_data, no_time=True)
        model = DCR(config, conv_data, user_history, arcs, embedding_matrix)
        path_name = str(config.batch_size) + "_" + str(config.factor_dim) + "_" + str(config.embedding_dim) + "_" + \
            str(config.kernal_num) + "_" + str(config.hidden_dim) + "_" + str(config.gcn_layers_num) + "_" + \
            str(config.neg_sample_num) + "_" + str(config.lr) + "_" + str(int(config.pos_weight)) + "_" + config.att
        if config.month_num != 4:
            path_name += "_" + str(config.month_num) + "m_" + str(config.runtime)
        else:
            path_name += "_" + str(config.runtime)
        if config.pretrained_file == 'NULL':
            path_name += "_npembed"
        if config.no_lstm:
            path_name += "_nlstm"
        if config.no_gcn:
            path_name += "_ngcn"
        if config.mlp_layers_num == 0:
            path_name += "_nmlp"
    else:
        print('Unknown model name: ' + modelname)
        exit(1)
    res_path = "BestResults/" + modelname + "/" + config.filename + "/"
    mod_path = "BestModels/" + modelname + "/" + config.filename + "/"
    if not os.path.isdir(res_path):
        os.makedirs(res_path)
    if not os.path.isdir(mod_path):
        os.makedirs(mod_path)
    mod_path += path_name + '.model'
    res_path += path_name + '.data'

    # Set the optimizing parameters
    loss_weights = torch.Tensor([1, config.pos_weight])
    if torch.cuda.is_available():  # run in GPU
        model = model.cuda()
        loss_weights = loss_weights.cuda()
    if config.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=config.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=config.lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1.0 / ((epoch + 1) ** 0.5))
    best_dev_map = 0.0
    best_epoch = -1
    best_epoch_loss = 999999
    no_improve = 0
    # Begin training
    for epoch in range(config.max_epoch):
        train_data = MyDataset(train_convs, train_convs_per_user, train=True, num_sampling=config.neg_sample_num)
        train_loader = data.DataLoader(train_data, batch_size=config.batch_size, num_workers=0, shuffle=True)
        loss = train_epoch(model, train_loader, loss_weights, optimizer, epoch)
        dev_map = evaluate(model, dev_data, dev=True)
        if dev_map > best_dev_map:
            no_improve = 0
            best_dev_map = dev_map
            if os.path.isfile(mod_path):
                os.remove(mod_path)  # drop the previous best checkpoint
            best_epoch = epoch
            best_epoch_loss = loss
            print('New Best Dev!!! MAP: %g' % best_dev_map)
            torch.save(model.state_dict(), mod_path)
        else:
            no_improve += 1
            print('Current Best Dev MAP: %g, Dev MAP: %g' % (best_dev_map, dev_map))
        if no_improve > 8:
            break
        scheduler.step()
    model.load_state_dict(torch.load(mod_path))
    # Evaluate and save results
    res = evaluate(model, test_data)
    print('Result in test set: MAP: %g, Precision@1: %g, Precision@5: %g, nDCG@5: %g, nDCG@10: %g, MRR: %g' %
          (res[0], res[1], res[2], res[3], res[4], res[5]))
    with open(res_path, 'w') as f:
        f.write('MAP\tPre@1\tPre@5\tnDCG@5\tnDCG@10\tMRR\n')
        f.write('%g\t%g\t%g\t%g\t%g\t%g\n' % (res[0], res[1], res[2], res[3], res[4], res[5]))
        if modelname != "Pop" and modelname != "Random":
            f.write('Dev MAP: %g\n' % best_dev_map)
            f.write('Best epoch: %d\n' % best_epoch)
            f.write('Best epoch loss: %g\n' % best_epoch_loss)
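

# Hypothetical driver: the attribute names mirror those read by train() above,
# but every value below is an assumption, not taken from the original code.
if __name__ == '__main__':
    from types import SimpleNamespace
    config = SimpleNamespace(
        filename='technology', modelname='DCR', month_num=4, batch_size=32,
        factor_dim=32, embedding_dim=100, kernal_num=100, hidden_dim=64,
        gcn_layers_num=2, neg_sample_num=5, lr=1e-3, pos_weight=10.0,
        att='dot', runtime=0, pretrained_file='NULL', no_lstm=False,
        no_gcn=False, mlp_layers_num=1, optim='adam', max_epoch=50)
    train(config)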