Example #1
    def build_data(args):
        print("Building dataset...")
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        vocab = Vocab(wl_th=args.wl_th, wcutoff=args.wcutoff)

        vocab.build(fname=args.train_file,
                    idf_file=args.idf_file,
                    firstline=False,
                    limit=args.sent_limit)
        args.vocab = vocab
        if args.word_emb_file is not None:
            scale = np.sqrt(3.0 / args.word_dim)
            args.word_pretrained = Embeddings.get_W(args.word_emb_file,
                                                    args.word_dim, vocab.w2i,
                                                    scale)
        else:
            args.word_pretrained = None

        if os.path.exists(args.idf_file):
            print("Load idf file ...")
            args.idf_embs = Embeddings.get_W(args.idf_file, 1, vocab.w2i, 0)
        else:
            args.idf_embs = None

        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
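
A minimal invocation sketch for build_data above; the argument names mirror exactly the fields the function reads, while the paths and values are placeholders (and Vocab, Embeddings, and SaveloadHP must come from the same project).

import argparse

# Hypothetical settings; every field below is one that build_data() reads.
args = argparse.Namespace(
    model_dir="./model",            # created if missing
    model_args="model_args.pkl",    # hyper-parameter dump written via SaveloadHP.save
    train_file="./data/train.txt",  # corpus the vocabulary is built from
    idf_file="./idf.txt",           # optional idf weights, loaded only if the file exists
    sent_limit=-1,                  # sentence limit passed to vocab.build
    wl_th=None,                     # word-length threshold for Vocab
    wcutoff=5,                      # word-frequency cutoff for Vocab
    word_emb_file=None,             # point to a pre-trained embedding file to load it
    word_dim=100,                   # embedding size used for the init scale
)
# args = build_data(args)  # uncomment once the project's Vocab/Embeddings/SaveloadHP are importable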
Example #2
    def load_data(self, datafile):

        dataset = pd.read_csv(datafile)
        if self.debug:
            dataset = dataset.iloc[:3000]

        text = 'comment_text'
        self.X = dataset[text].values

        labels = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        # labels = ['severe_toxic']
        assert (len(labels) == self.config.label_size)
        self.y = dataset[labels].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=124)

        ## Build the vocabulary using the train data.
        self.vocab = Vocab()
        train_sents = [get_words(line) for line in self.X_train]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)),
                             threshold=self.config.min_word_freq)
        print('Training on {} samples and validating on {} samples'.format(
            len(self.X_train), len(self.X_val)))
        print()

        self.embedding_matrix = np.random.uniform(
            -0.005, 0.005, size=[len(self.vocab),
                                 self.config.embed_size]).astype('float32')
        with tf.variable_scope("Embeddings") as scope:
            embedding = tf.get_variable("Embeds",
                                        initializer=self.embedding_matrix,
                                        dtype=tf.float32)

        if self.debug:
            return

        ## Populate embedding matrix from pre-trained word embeddings
        pretrained_index = {}
        with open('./WordVectors/crawl-300d-2M.vec') as fh:
            for line in fh:
                word_vec = line.strip().split()
                word = word_vec[0]
                vector = np.asarray(word_vec[1:], dtype='float32')
                pretrained_index[word] = vector

        pw = 0.0

        for word, idx in self.vocab.word_to_idx.items():
            pretrained_vector = pretrained_index.get(word)
            if pretrained_vector is not None:
                self.embedding_matrix[idx] = pretrained_vector
                pw += 1

        print("Found pretrained vectors for {:.2f}% of data".format(
            pw / len(self.vocab) * 100))
        del pretrained_index  ## Freed only because of memory constraints -- don't copy this pattern in general.
Example #3
def main():
    input_file = "data/train.txt"
    vocab_file = "data/vocab"
    embedding_file = "data/glove.npz"
    glove_file = "data/glove.840B.300d.txt"
    dict_file = "data/dict.p"
    max_vocab_size = 5e4
    Vocab.build_vocab(input_file, vocab_file, dict_file, glove_file,
                      embedding_file, max_vocab_size)
Example #4
def encode_sentence(data_dir):
    vocab = Vocab(os.path.join(data_dir, 'dict_cleaned.txt'))
    split_paths = {}
    for split in ['train', 'test']:
        split_paths[split] = os.path.join(data_dir, split)
        encodes = []
        with open(os.path.join(split_paths[split], 'sents.txt'), 'r') as sf:
            for line in sf.readlines():
                sentence = line.strip().split()
                index = [str(vocab.encode(word)) for word in sentence]
                encode = " ".join(index)
                encodes.append(encode)

        with open(os.path.join(split_paths[split], 'index.txt'), 'w') as wf:
            wf.writelines('\n'.join(encodes))
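
A usage sketch for encode_sentence, assuming a hypothetical data_dir that contains dict_cleaned.txt plus train/sents.txt and test/sents.txt; the function then writes a parallel index.txt per split.

import os

data_dir = "./data/mr"  # hypothetical layout
expected = [
    os.path.join(data_dir, "dict_cleaned.txt"),
    os.path.join(data_dir, "train", "sents.txt"),
    os.path.join(data_dir, "test", "sents.txt"),
]
if all(os.path.exists(p) for p in expected):
    encode_sentence(data_dir)  # produces train/index.txt and test/index.txt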
Example #5
def load_mr(data_dir):
    voc = Vocab(os.path.join(data_dir, 'dict_cleaned.txt'))

    split_paths = {}
    for split in ["train", "test"]:
        split_paths[split] = os.path.join(data_dir, split)

    data = {}
    max_sentence_length = 0
    count = 0
    sumlen = 0

    for split, path in split_paths.items():
        sentencepath = os.path.join(path, "index.txt")
        labelpath = os.path.join(path, "labels.txt")

        splitdata = []
        with open(sentencepath, 'r') as sf, open(labelpath, 'r') as lf:
            for line, label in zip(sf.readlines(), lf.readlines()):
                sentence = line.strip()
                pair = {}
                pair['sentence'] = sentence
                pair['label'] = int(label.strip())

                splitdata.append(pair)
                if len(sentence) > max_sentence_length:
                    max_sentence_length = len(sentence)
                sumlen += len(sentence)
                count += 1
        data[split] = splitdata

    average_len = int(sumlen / count)
    return data, voc, max_sentence_length, average_len
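
A small consumption sketch for load_mr, assuming the same hypothetical directory layout as in the previous example (index.txt and labels.txt under train/ and test/). Note that the reported lengths are character counts of the encoded index strings.

import os

data_dir = "./data/mr"  # hypothetical path
if os.path.isdir(os.path.join(data_dir, "train")):
    data, voc, max_len, avg_len = load_mr(data_dir)
    print("train pairs:", len(data["train"]), "test pairs:", len(data["test"]))
    print("max / average encoded length (characters):", max_len, avg_len)
    first = data["train"][0]
    print("label:", first["label"], "indices:", first["sentence"][:40], "...")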
Example #6
def main():
    config = Config()
    vocab = Vocab(config.dict_file)
    dev_q, dev_c, dev_s, dev_spans, dev_s_idx, dev_answerable = load_data(
        config.dev_file, vocab, config.debug)
    dev_data = list(
        zip(dev_q, dev_c, dev_s, dev_s_idx, dev_answerable, dev_spans))
    ssnet = SSQANet(config)
    ssnet.build_model()
    ssnet.restore_session(config.dir_model)
    batches = batch_loader(dev_data, config.batch_size, shuffle=False)
    acc_history = []
    em_history = []
    for batch in batches:
        batch_q, batch_c, batch_s, batch_s_idx, batch_ans, batch_spans = zip(
            *batch)
        question_lengths, padded_q = zero_padding(batch_q, level=1)
        context_lengths, padded_c = zero_padding(batch_c, level=1)
        sequence_lengths, sentence_lengths, padded_s = zero_padding(batch_s,
                                                                    level=2)

        batch_acc, batch_em, batch_loss = ssnet.eval(
            padded_q, question_lengths, padded_c, context_lengths, padded_s,
            sequence_lengths, sentence_lengths, batch_s_idx, batch_ans,
            batch_spans)
        acc_history.append(batch_acc)
        em_history.append(batch_em)

    dev_acc = np.mean(acc_history)
    dev_em = np.mean(em_history)
    print("classification acc :{}".format(dev_acc))
    print("EM :{}".format(dev_em))
Example #7
    def __init__(self,
                 model,
                 optimizer,
                 train_dataset,
                 test_dataset,
                 num_folds=config.num_folds,
                 loss_function=None):

        self.num_folds = num_folds
        assert num_folds >= 1

        self.use_crf = config.use_crf

        vocab = Vocab.from_files(
            [config.dataset_path, config.test_dataset_path],
            store=config.mapping_file)

        #self.train_dataset = ReviewDataset(config.dataset_path, preprocessed= False, vocab= vocab)
        #self.test_dataset = ReviewDataset(config.test_dataset_path, preprocessed= False, vocab= vocab)

        #self.model = model( vocab, embedding_path= config.word_embedding_path, use_crf= config.use_crf ).to(config.device)

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.model = model
        self.optimizer = optimizer(self.model.parameters())

        if not self.use_crf and loss_function is None:
            raise Exception(
                'Loss function must be specified when CRF is not being used')

        self.device = torch.device(
            config.device if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        print('using device: ', self.device)
Example #8
        'adam': torch.optim.Adam,  # default lr=0.001
        'adamax': torch.optim.Adamax,  # default lr=0.002
        'asgd': torch.optim.ASGD,  # default lr=0.01
        'rmsprop': torch.optim.RMSprop,  # default lr=0.01
        'sgd': torch.optim.SGD,
    }

    models = {
        'lstm': LSTM,
        'attention_lstm': AttentionAspectExtraction,
        'global_attention_lstm': GlobalAttentionAspectExtraction,
        'hsan': HSAN,
        'decnn': DECNN
    }

    vocab = Vocab.from_files([config.dataset_path, config.test_dataset_path],
                             store=config.mapping_file)
    train_dataset = ReviewDataset(config.dataset_path,
                                  preprocessed=False,
                                  vocab=vocab)
    test_dataset = ReviewDataset(config.test_dataset_path,
                                 preprocessed=False,
                                 vocab=vocab)

    network = models[config.model](vocab,
                                   embedding_path=config.word_embedding_path,
                                   lambda1=config.lambda1,
                                   use_crf=config.use_crf).to(config.device)
    trainer = Trainer(network,
                      optimizers[config.optimizer],
                      train_dataset,
                      test_dataset,
Example #9
        return label_prob, label_pred

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size,sequence_len,hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        # (batch_size,sequence_len,num_labels+2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
    
if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    
    
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
Example #10
def main(opts):

    if len(opts) == 0:
        raise ValueError("Usage: build_data.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering, or bbn."
        )

    cf.load_config(dataset)
    global MAX_SENT_LEN
    MAX_SENT_LEN = cf.MAX_SENT_LEN

    dataset_filenames = {
        "train": cf.TRAIN_FILENAME,
        "dev": cf.DEV_FILENAME,
    }

    # 1. Construct the Hierarchy by looking through each dataset for unique labels.
    hierarchy = build_hierarchy(dataset_filenames)

    # 2. Construct two empty Vocab objects (one for words, another for wordpieces), which will be populated in step 3.
    word_vocab = Vocab()
    wordpiece_vocab = Vocab()

    logger.info("Hierarchy contains %d categories unique to the test set." %
                len(hierarchy.get_categories_unique_to_test_dataset()))

    # 3. Build a data loader for each dataset (train, dev).
    data_loaders = {}
    for ds_name, filepath in dataset_filenames.items():
        logger.info("Loading %s dataset from %s." % (ds_name, filepath))
        dataset, sentences, total_wordpieces = build_dataset(
            filepath, hierarchy, word_vocab, wordpiece_vocab, ds_name)
        if ds_name == "dev":
            batch_size = 1
        else:
            batch_size = cf.BATCH_SIZE
        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully." % ds_name)

        logger.info(
            "Dataset contains %i wordpieces (including overly long sentences)."
            % total_wordpieces)
        if ds_name == "train":
            total_wordpieces_train = total_wordpieces

    BYPASS_SAVING = False
    if BYPASS_SAVING:
        logger.info("Bypassing file saving - training model directly")
        train_without_loading(data_loaders, word_vocab, wordpiece_vocab,
                              hierarchy, total_wordpieces_train)
        return

    logger.info("Saving data loaders to file...")

    dutils.save_obj_to_pkl_file(data_loaders, 'data loaders',
                                cf.ASSET_FOLDER + '/data_loaders.pkl')

    logger.info("Saving vocabs and hierarchy to file...")
    dutils.save_obj_to_pkl_file(word_vocab, 'word vocab',
                                cf.ASSET_FOLDER + '/word_vocab.pkl')
    dutils.save_obj_to_pkl_file(wordpiece_vocab, 'wordpiece vocab',
                                cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    dutils.save_obj_to_pkl_file(hierarchy, 'hierarchy',
                                cf.ASSET_FOLDER + '/hierarchy.pkl')

    dutils.save_obj_to_pkl_file(total_wordpieces_train, 'total_wordpieces',
                                cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    dutils.save_list_to_file(word_vocab.ix_to_token, 'word vocab',
                             cf.DEBUG_FOLDER + '/word_vocab.txt')
    dutils.save_list_to_file(wordpiece_vocab.ix_to_token, 'wordpiece vocab',
                             cf.DEBUG_FOLDER + '/wordpiece_vocab.txt')
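
A hypothetical re-load of the artifacts saved above, under the assumption that save_obj_to_pkl_file writes a plain pickle; cf.ASSET_FOLDER is the same config value used in the snippet.

import pickle

with open(cf.ASSET_FOLDER + '/word_vocab.pkl', 'rb') as f:
    word_vocab = pickle.load(f)   # Vocab object for words
with open(cf.ASSET_FOLDER + '/hierarchy.pkl', 'rb') as f:
    hierarchy = pickle.load(f)    # label hierarchy built in step 1
print(len(word_vocab.ix_to_token), "words in the reloaded vocab")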
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--name',
                        help="name of dataset",
                        type=str,
                        default=None)
    parser.add_argument('--data_indice',
                        help="indices of dataset",
                        type=str,
                        default=None)
    parser.add_argument('--adjacency',
                        help="use adjacency matrix",
                        type=bool,
                        default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--embed_size',
                        help="embedding vector size",
                        type=int,
                        default=1024)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--layers',
                        help="number of layers",
                        type=int,
                        default=6)
    parser.add_argument('--nhead', help="number of head", type=int, default=4)
    parser.add_argument('--saved_model',
                        help="dir of fine-tuned model",
                        type=str)
    parser.add_argument('--matrix_position',
                        help="position of adjacency matrix",
                        type=str,
                        default='atom')
    parser.add_argument('--num_workers',
                        help="number of workers",
                        type=int,
                        default=0)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument('--type', type=str)
    #parser.add_argument('--type', help="type of dataset", type=str)
    arg = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    print("device:", device)
    Smiles_vocab = Vocab()
    if arg.type == 'zinc':
        testdataset = SmilesDataset(arg.path,
                                    Smiles_vocab,
                                    seq_len=arg.seq,
                                    mat_position=arg.matrix_position)
    else:
        testdataset = ADMETDataset(arg.path,
                                   arg.name,
                                   Smiles_vocab,
                                   seq_len=arg.seq,
                                   trainType='Training',
                                   mat_position=arg.matrix_position)
    test_dataloader = DataLoader(testdataset,
                                 batch_size=arg.batch,
                                 num_workers=arg.num_workers)

    model = Smiles_BERT(len(Smiles_vocab),
                        max_len=arg.seq,
                        nhead=arg.nhead,
                        model_dim=arg.embed_size,
                        nlayers=arg.layers,
                        adj=arg.adjacency)
    value_layer = nn.Linear(arg.embed_size, 1)
    mask_layer = Masked_prediction(arg.embed_size, len(Smiles_vocab))
    model = BERT_double_tasks(model, value_layer, mask_layer)

    model.load_state_dict(torch.load(arg.saved_model))
    model.to(device)
    #if torch.cuda.device_count() > 1:
    #	model = nn.DataParallel(model)

    correct = 0
    total = 0
    predicted_list = np.array([])
    target_list = np.array([])
    total_loss = 0

    criterion = nn.L1Loss()

    model.eval()
    test_iter = tqdm.tqdm(enumerate(test_dataloader),
                          total=len(test_dataloader))
    position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)

    with torch.no_grad():
        for i, data in test_iter:
            data = {key: value.to(device) for key, value in data.items()}
            if data["smiles_bert_input"].size(0) != arg.batch:
                position_num = torch.arange(arg.seq).repeat(
                    data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                qed_output, output = model(
                    data["smiles_bert_input"],
                    position_num,
                    adj_mask=data["smiles_bert_adj_mask"],
                    adj_mat=data["smiles_bert_adjmat"])
            else:
                qed_output, output = model(data["smiles_bert_input"],
                                           position_num)
            #output = output[:,0]
            loss = criterion(qed_output, data["smiles_bert_value"].view(-1, 1))
            total_loss += loss.item()
            predicted = output.argmax(dim=-1)
            #print(predicted, data["smiles_bert_label"].shape)
            for k in range(predicted.size(0)):
                for j in range(predicted.size(1)):
                    if data["smiles_bert_label"][k][j].item() != 0:
                        correct += predicted[k][j].eq(
                            data["smiles_bert_label"][k]
                            [j].item()).sum().item()
                        total += 1

            #predicted_list = np.append(predicted_list, predicted.cpu().detach().numpy())
            #target_list = np.append(target_list, data["smiles_bert_label"].cpu().detach().numpy())
            #_, predicted = torch.max(output.data, 1)

            #total += data["smiles_bert_label"].size(0)
            #correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()

    #predicted_list = np.reshape(predicted_list, (-1))
    #target_list = np.reshape(target_list, (-1))
    #print(predicted_list, target_list)
    print("Accuracy on testset: ", 100 * correct / total, "MAE on QED:",
          total_loss / len(test_iter))
Example #12
if __name__ == '__main__':
    vocab_num = 100000
    pubmed_w2v_path = 'pubmed_w2v.txt'
    emb_path = 'emb_cnn.pt'
    opt = Options(config_vocab=False)
    pubmedreader = PubMedReader(opt)
    print('loading text data')
    train_sents, train_labels, test_sents, test_labels, valid_sents, valid_labels = pubmedreader.get_data(
    )

    print('read vocab')
    fixed_vocab_set = read_vocab(pubmed_w2v_path)
    print('fixed vocab set size {}'.format(len(fixed_vocab_set)))
    print('build vocab')
    vocab = Vocab.build_vocab(train_sents, fixed_vocab_set=fixed_vocab_set)
    #
    vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
    vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
    #
    print('vocab size {} before shrink'.format(vocab.vocab_len))
    vocab.shrink_vocab(2)
    print('vocab size {} after shrink'.format(vocab.vocab_len))

    print('read vec')
    word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
    vec = read_vec(pubmed_w2v_path, word_list)
    assert vec.shape[0] == vocab.vocab_len

    print('build emb layer')
    emb = Embedding(vocab.vocab_len,
Example #13
# word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
# idx_word = {}

max_story_size = max(map(len, (s for s, _, _ in data)))
mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
query_size = max(map(len, (q for _, q, _ in data)))
answer_size = max(map(len, (a for _, _, a in data)))
del data
sentence_size = max(query_size, sentence_size, answer_size)  # for the position
sentence_size += 1  # +1 for time words +1 for go +1 for eos

memory_size = min(FLAGS.memory_size,
                  max_story_size)  #+ FLAGS.additional_info_memory_size
vocab = Vocab()
vocab.add_vocab(words)
# for i in range(memory_size):
#     vocab.word_to_index('time{}'.format(i + 1))

S, Q, A, A_fact, A_weight = vectorize_data(train,
                                           vocab,
                                           sentence_size,
                                           memory_size,
                                           fact=FLAGS.model_type)
# Add time words/indexes

additional_vocab_size = 50  # for additional info from the knowledge base
vocab_size = vocab.vocab_size  #+ additional_vocab_size  # +1 for nil word

# sentence_size= max(sentence_size,20) # set the same certain length for decoder
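
A toy illustration (with made-up stories) of the size bookkeeping above, using the same (story, query, answer) tuple format:

from itertools import chain
import numpy as np

toy = [
    ([["john", "went", "home"], ["mary", "got", "milk"]],
     ["where", "is", "mary"], ["kitchen"]),
    ([["bob", "slept"]], ["where", "is", "bob"], ["bed"]),
]
max_story_size = max(map(len, (s for s, _, _ in toy)))                    # 2 sentences
mean_story_size = int(np.mean([len(s) for s, _, _ in toy]))               # int(1.5) == 1
sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in toy)))  # 3 tokens
query_size = max(map(len, (q for _, q, _ in toy)))                        # 3 tokens
print(max_story_size, mean_story_size, sentence_size, query_size)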
Example #14
        return batch_loss

    def inference(self, label_score, k=1):
        if self.num_labels > 2:
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile
    filename = "../data/train.txt"
    vocab = Vocab(wl_th=None, cutoff=2)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=4)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
Example #15
def main():
    config = Config()
    vocab = Vocab(config.dict_file)
    q, c, s, spans, s_idx, answerable = load_data(config.train_file, vocab,
                                                  config.debug)
    dev_q, dev_c, dev_s, dev_spans, dev_s_idx, dev_answerable = load_data(
        config.dev_file, vocab, config.debug)
    train_data = list(zip(q, c, s, s_idx, answerable, spans))
    dev_data = list(
        zip(dev_q, dev_c, dev_s, dev_s_idx, dev_answerable, dev_spans))
    ssnet = SSQANet(config)
    ssnet.build_model()
    best_score = 0
    for i in range(config.num_epochs):
        epoch = i + 1
        batches = batch_loader(train_data, config.batch_size, shuffle=False)
        for batch in batches:
            batch_q, batch_c, batch_s, batch_s_idx, batch_ans, batch_spans = zip(
                *batch)
            question_lengths, padded_q = zero_padding(batch_q, level=1)
            context_lengths, padded_c = zero_padding(batch_c, level=1)
            sequence_lengths, sentence_lengths, padded_s = zero_padding(
                batch_s, level=2)
            loss, acc, pred, step = ssnet.train(padded_q, question_lengths,
                                                padded_c, context_lengths,
                                                padded_s, sequence_lengths,
                                                sentence_lengths, batch_s_idx,
                                                batch_ans, batch_spans,
                                                config.dropout)
            train_batch_acc, train_batch_em, train_batch_loss = ssnet.eval(
                padded_q, question_lengths, padded_c, context_lengths,
                padded_s, sequence_lengths, sentence_lengths, batch_s_idx,
                batch_ans, batch_spans)
            if step % 100 == 0:
                print("epoch: %d, step:%d, loss:%.4f, acc:%.2f, em:%.2f" %
                      (epoch, step, loss, train_batch_acc, train_batch_em))

            if step % 1000 == 0:
                dev_batches = batch_loader(dev_data,
                                           config.batch_size,
                                           shuffle=False)
                total_em = []
                total_acc = []
                total_loss = []
                for dev_batch in dev_batches:
                    dev_batch_q, dev_batch_c, dev_batch_s, \
                    dev_batch_s_idx, dev_batch_ans, dev_batch_spans = zip(*dev_batch)
                    question_lengths, padded_q = zero_padding(dev_batch_q,
                                                              level=1)
                    context_lengths, padded_c = zero_padding(dev_batch_c,
                                                             level=1)
                    sequence_lengths, sentence_lengths, padded_s = zero_padding(
                        dev_batch_s, level=2)
                    dev_batch_acc, dev_batch_em, dev_batch_loss = ssnet.eval(
                        padded_q, question_lengths, padded_c, context_lengths,
                        padded_s, sequence_lengths, sentence_lengths,
                        dev_batch_s_idx, dev_batch_ans, dev_batch_spans)

                    total_loss.append(dev_batch_loss)
                    total_em.append(dev_batch_em)
                    total_acc.append(dev_batch_acc)
                dev_em = np.mean(total_em)
                dev_acc = np.mean(total_acc)
                dev_loss = np.mean(total_loss)
                ssnet.write_summary(dev_acc, dev_em, dev_loss, mode="dev")
                ssnet.write_summary(train_batch_acc,
                                    train_batch_em,
                                    train_batch_loss,
                                    mode="train")
                print("after %d step, dev_em:%.2f" % (step, dev_em))
                if dev_em > best_score:
                    best_score = dev_em
                    print("new score! em: %.2f, acc:%.2f" % (dev_em, dev_acc))
                    ssnet.save_session(config.dir_model)
Example #16
        distance = 1 + pred_score - y_score.view(-1, 1)
        abs_distance = torch.max(distance, torch.zeros_like(distance))
        ranking = abs_distance.sum(-1)
        reg = self.regularized()
        return ranking.mean() + reg


if __name__ == "__main__":
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings
    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"

    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i,
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
Example #17
import numpy as np
import json

from data_utils import Vocab

vocab = {}
vectors = []
index = 0

train_dataset = './datasets/Restaurants_Train.xml'
test_dataset = './datasets/Restaurants_Test.xml'
mapping_file = './embeddings/restaurant_mapping.json'
vocab = Vocab.from_files([train_dataset, test_dataset],
                         store=mapping_file).get_vocab()

embedding = np.zeros((len(vocab), 200))

with open('embeddings/glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in vocab:
            vector = np.asarray(values[1:])
            embedding[vocab[word], :100] = vector

print('glove done')

with open('embeddings/domain_embedding/restaurant_emb.vec',
          'r',
          encoding='utf-8') as f:
    for line in f:
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--dataset', help="name of dataset", type=str)
    #parser.add_argument('--data_indice', help="indices of dataset", type=str)
    parser.add_argument('--adjacency',
                        help="use adjacency matrix",
                        type=bool,
                        default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--epoch', help="epoch", type=int, default=100)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--lr',
                        help="learning rate",
                        type=float,
                        default=0.0001)
    parser.add_argument('--embed_size',
                        help="embedding vector size",
                        type=int,
                        default=1024)
    parser.add_argument('--model_dim',
                        help="dim of transformer",
                        type=int,
                        default=1024)
    parser.add_argument('--layers',
                        help="number of layers",
                        type=int,
                        default=6)
    parser.add_argument('--nhead', help="number of head", type=int, default=4)
    parser.add_argument('--drop_rate',
                        help="ratio of dropout",
                        type=float,
                        default=0)
    parser.add_argument('--matrix_position',
                        help="position of adjacency matrix",
                        type=str,
                        default='atom')
    parser.add_argument('--warmup_step',
                        help="warmup step for scheduled learning rate",
                        type=int,
                        default=10000)
    parser.add_argument('--num_workers',
                        help="number of workers",
                        type=int,
                        default=0)
    parser.add_argument('--split',
                        help="type of dataset",
                        type=str,
                        default='scaffold')
    parser.add_argument('--saved_model',
                        help="dir of pre-trained model",
                        type=str)
    parser.add_argument("--seed", type=int, default=7)
    arg = parser.parse_args()

    _init_seed_fix(arg.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    if arg.dataset == "tox21":
        num_tasks = 12
    elif arg.dataset == "bace":
        num_tasks = 1
    elif arg.dataset == 'bbbp':
        num_tasks = 1
    elif arg.dataset == 'clintox':
        num_tasks = 2
    elif arg.dataset == 'sider':
        num_tasks = 27
    elif arg.dataset == 'toxcast':
        num_tasks = 617
    elif arg.dataset == 'muv':
        num_tasks = 17
    elif arg.dataset == 'hiv':
        num_tasks = 1
    else:
        raise ValueError("Unsupported dataset: " + arg.dataset)

    Smiles_vocab = Vocab()
    # read data
    dataset = FinetuningDataset(arg.path,
                                arg.dataset,
                                Smiles_vocab,
                                seq_len=arg.seq,
                                trainType='Training',
                                mat_position=arg.matrix_position)
    print("Dataset loaded")
    if arg.split == 'scaffold':
        smiles_csv = pd.read_csv(arg.path + "/" + arg.dataset + ".csv",
                                 sep=',')
        smiles_list = smiles_csv['smiles'].tolist()

        train_idx, valid_idx, test_idx = scaffold_split(smiles_list)
    elif arg.split == 'random_scaffold':
        smiles_csv = pd.read_csv(arg.path + "/" + arg.dataset + ".csv",
                                 sep=',')
        smiles_list = smiles_csv['smiles'].tolist()

        train_idx, valid_idx, test_idx = random_scaffold(smiles_list, arg.seed)
    else:
        indices = list(range(len(dataset)))
        split1, split2 = int(np.floor(0.1 * len(dataset))), int(
            np.floor(0.2 * len(dataset)))
        #np.random.seed(arg.seed)
        np.random.shuffle(indices)
        train_idx, valid_idx, test_idx = indices[split2:], indices[
            split1:split2], indices[:split1]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    # preprocessing - dataloader(train, valid, test)
    train_dataloader = DataLoader(dataset,
                                  batch_size=arg.batch,
                                  sampler=train_sampler,
                                  num_workers=arg.num_workers,
                                  pin_memory=True)
    valid_dataloader = DataLoader(dataset,
                                  batch_size=arg.batch,
                                  sampler=valid_sampler,
                                  num_workers=arg.num_workers)
    test_dataloader = DataLoader(dataset,
                                 batch_size=arg.batch,
                                 sampler=test_sampler,
                                 num_workers=arg.num_workers)

    model = Smiles_BERT(len(Smiles_vocab),
                        max_len=arg.seq,
                        nhead=arg.nhead,
                        feature_dim=arg.embed_size,
                        feedforward_dim=arg.model_dim,
                        nlayers=arg.layers,
                        adj=arg.adjacency,
                        dropout_rate=arg.drop_rate)
    model.load_state_dict(torch.load(arg.saved_model))
    output_layer = nn.Linear(arg.embed_size, num_tasks)

    model = BERT_base(model, output_layer)
    #model = BERT_base_dropout(model, output_layer)

    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    #model.to(device)

    optim = Adam(model.parameters(), lr=arg.lr, weight_decay=0)
    criterion = nn.BCEWithLogitsLoss(reduction='none')
    # load model
    print("Start fine-tuning with seed", arg.seed)
    min_valid_loss = 100000
    counter = 0

    for epoch in range(arg.epoch):
        avg_loss = 0
        valid_avg_loss = 0
        total_hit = 0
        total = 0

        data_iter = tqdm.tqdm(enumerate(train_dataloader),
                              total=len(train_dataloader))
        #position_num = torch.arange(arg.seq).repeat(arg.batch,1).to(device)
        model.train()
        for i, data in data_iter:
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(
                data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model.forward(data["smiles_bert_input"],
                                       position_num,
                                       adj_mask=data["smiles_bert_adj_mask"],
                                       adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model.forward(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(
                output.shape).to(torch.float64)
            is_valid = data["smiles_bert_label"]**2 > 0

            loss = criterion(output.double(),
                             (data["smiles_bert_label"] + 1) / 2)
            loss = torch.where(
                is_valid, loss,
                torch.zeros(loss.shape).to(loss.device).to(loss.dtype))
            optim.zero_grad()
            loss = torch.sum(loss) / torch.sum(is_valid)
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optim.step()

            avg_loss += loss.item()
            status = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }
            if i % 100 == 0:
                print(i)
                #data_iter.write(str(status))
        print("Epoch: ", epoch, "average loss: ", avg_loss / len(data_iter))

        model.eval()
        valid_iter = tqdm.tqdm(enumerate(valid_dataloader),
                               total=len(valid_dataloader))
        #position_num = torch.arange(arg.seq).repeat(arg.batch,1).to(device)
        predicted_list = []
        target_list = []

        with torch.no_grad():
            for i, data in valid_iter:
                data = {key: value.to(device) for key, value in data.items()}
                position_num = torch.arange(arg.seq).repeat(
                    data["smiles_bert_input"].size(0), 1).to(device)
                if arg.adjacency is True:
                    output = model.forward(
                        data["smiles_bert_input"],
                        position_num,
                        adj_mask=data["smiles_bert_adj_mask"],
                        adj_mat=data["smiles_bert_adjmat"])
                else:
                    output = model.forward(data["smiles_bert_input"],
                                           position_num)
                output = output[:, 0]
                data["smiles_bert_label"] = data["smiles_bert_label"].view(
                    output.shape).to(torch.float64)
                is_valid = data["smiles_bert_label"]**2 > 0
                valid_loss = criterion(output.double(),
                                       (data["smiles_bert_label"] + 1) / 2)
                valid_loss = torch.where(
                    is_valid, valid_loss,
                    torch.zeros(valid_loss.shape).to(valid_loss.device).to(
                        valid_loss.dtype))
                valid_loss = torch.sum(valid_loss) / torch.sum(is_valid)

                valid_avg_loss += valid_loss.item()
                predicted = torch.sigmoid(output)
                predicted_list.append(predicted)
                target_list.append(data["smiles_bert_label"])

                #_, predicted = torch.max(output.data, 1)

                #total += data["smiles_bert_label"].size(0)
                #total_hit += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()
        predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
        target_list = torch.cat(target_list, dim=0).cpu().numpy()
        #predicted_list = np.reshape(predicted_list, -1)
        #target_list = np.reshape(target_list, -1)
        roc_list = []
        for i in range(target_list.shape[1]):
            if np.sum(target_list[:, i] == 1) > 0 and np.sum(
                    target_list[:, i] == -1) > 0:
                is_valid = target_list[:, i]**2 > 0
                roc_list.append(
                    roc_auc_score((target_list[is_valid, i] + 1) / 2,
                                  predicted_list[is_valid, i]))

        print("AUCROC: ", sum(roc_list) / len(roc_list))

        if valid_avg_loss < min_valid_loss:
            save_path = "../finetuned_model/" + str(
                arg.dataset) + "_epoch_" + str(epoch) + "_val_loss_" + str(
                    round(valid_avg_loss / len(valid_dataloader), 5))
            torch.save(model.state_dict(), save_path + '.pt')
            model.to(device)
            min_valid_loss = valid_avg_loss
            counter = 0

        counter += 1
        if counter > 5:
            break

    # eval
    print("Finished. Start evaluation.")
    correct = 0
    total = 0
    predicted_list = []
    target_list = []

    model.eval()
    #test_iter = tqdm.tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    #position_num = torch.arange(arg.seq).repeat(arg.batch,1).to(device)
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(
                data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model(data["smiles_bert_input"],
                               position_num,
                               adj_mask=data["smiles_bert_adj_mask"],
                               adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(
                output.shape).to(torch.float64)
            predicted = torch.sigmoid(output)
            predicted_list.append(predicted)
            target_list.append(data["smiles_bert_label"])

            #_, predicted = torch.max(output.data, 1)

            #total += data["smiles_bert_label"].size(0)
            #correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()
        predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
        target_list = torch.cat(target_list, dim=0).cpu().numpy()
        #predicted_list = np.reshape(predicted_list, -1)
        #target_list = np.reshape(target_list, -1)
        roc_list = []
        for i in range(target_list.shape[1]):
            if np.sum(target_list[:, i] == 1) > 0 and np.sum(
                    target_list[:, i] == -1) > 0:
                is_valid = target_list[:, i]**2 > 0
                roc_list.append(
                    roc_auc_score((target_list[is_valid, i] + 1) / 2,
                                  predicted_list[is_valid, i]))

        print("AUCROC: ", sum(roc_list) / len(roc_list))
    print("Evaluate on min valid loss model")
    correct = 0
    total = 0
    predicted_list = []
    target_list = []
    model.load_state_dict(torch.load(save_path + '.pt'))
    model.eval()
    #test_iter = tqdm.tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    #position_num = torch.arange(arg.seq).repeat(arg.batch,1).to(device)
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(
                data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model(data["smiles_bert_input"],
                               position_num,
                               adj_mask=data["smiles_bert_adj_mask"],
                               adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(
                output.shape).to(torch.float64)
            predicted = torch.sigmoid(output)
            predicted_list.append(predicted)
            target_list.append(data["smiles_bert_label"])
            #_, predicted = torch.max(output.data, 1)

            #total += data["smiles_bert_label"].size(0)
            #correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()

        #predicted_list = np.reshape(predicted_list, -1)
        #target_list = np.reshape(target_list, -1)
        predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
        target_list = torch.cat(target_list, dim=0).cpu().numpy()
        roc_list = []
        for i in range(target_list.shape[1]):
            if np.sum(target_list[:, i] == 1) > 0 and np.sum(
                    target_list[:, i] == -1) > 0:
                is_valid = target_list[:, i]**2 > 0
                roc_list.append(
                    roc_auc_score((target_list[is_valid, i] + 1) / 2,
                                  predicted_list[is_valid, i]))

        print("AUCROC: ", sum(roc_list) / len(roc_list))
Example #19
        # Calculate un-normalized scores
        decoded_scores = self.scorer_layer(h_n_drop)
        # YOUR CODE ENDS HERE
        #######################
        return decoded_scores, rec_hidden, rec_output


if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.small.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    rec_type = "LSTM"
    ntoken = len(vocab.w2i)
    nlabels = len(vocab.l2i)
    emb_size = 50
    hidden_size = 64
    nlayers = 2
    dropout = 0.5
    bidirect = False

    #embedding_matrix=create_embedding_matrix(vocab,ntoken,emb_size)
    #print(embedding_matrix[5])
Example #20
def main():

    # The dataset filenames are stored as a dictionary, e.g.
    # "train": "data/bbn/train.json",
    # "dev": "data/bbn/dev.json"... etc
    dataset_filenames = {
        "train": cf.TRAIN_FILENAME,
        "dev": cf.DEV_FILENAME,
        "test": cf.TEST_FILENAME,
    }

    # 1. Construct the Hierarchy by looking through each dataset for unique labels.
    hierarchy = build_hierarchy(dataset_filenames)

    # 2. Construct two empty Vocab objects (one for words, another for wordpieces), which will be populated in step 3.
    word_vocab = Vocab()
    wordpiece_vocab = Vocab()

    logger.info("Hierarchy contains %d categories unique to the test set." %
                len(hierarchy.get_categories_unique_to_test_dataset()))

    # 3. Build a data loader for each dataset (train, dev, test).
    # A 'data loader' is a PyTorch object that stores a dataset in a numeric format.
    data_loaders = {}
    # Iterate over each of the train, dev and test datasets.
    for ds_name, filepath in dataset_filenames.items():
        logger.info("Loading %s dataset from %s." % (ds_name, filepath))
        dataset, total_wordpieces = build_dataset(filepath, hierarchy,
                                                  word_vocab, wordpiece_vocab,
                                                  ds_name)
        data_loader = DataLoader(dataset,
                                 batch_size=cf.BATCH_SIZE,
                                 pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully." % ds_name)

        logger.info(
            "Dataset contains %i wordpieces (including overly long sentences)."
            % total_wordpieces)
        if ds_name == "train":
            total_wordpieces_train = total_wordpieces

    print(hierarchy.category_counts['train'])

    # This part is not necessary (it was added so that I didn't have to save the huge Wiki dataset to disk).
    # If BYPASS_SAVING is set to true, the model will start training and the data loaders will not be saved onto the harddrive.
    BYPASS_SAVING = False
    if BYPASS_SAVING:
        logger.info("Bypassing file saving - training model directly")
        train_without_loading(data_loaders, word_vocab, wordpiece_vocab,
                              hierarchy, total_wordpieces_train)
        #return
        logger.info("Evaluating directly")
        evaluate_without_loading(data_loaders, word_vocab, wordpiece_vocab,
                                 hierarchy, total_wordpieces_train)
        return

    # This part saves every data loader into the asset directory, so that they can be read during training.
    logger.info("Saving data loaders to file...")

    dutils.save_obj_to_pkl_file(data_loaders, 'data loaders',
                                cf.ASSET_FOLDER + '/data_loaders.pkl')

    logger.info("Saving vocabs and hierarchy to file...")
    dutils.save_obj_to_pkl_file(word_vocab, 'word vocab',
                                cf.ASSET_FOLDER + '/word_vocab.pkl')
    dutils.save_obj_to_pkl_file(wordpiece_vocab, 'wordpiece vocab',
                                cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    dutils.save_obj_to_pkl_file(hierarchy, 'hierarchy',
                                cf.ASSET_FOLDER + '/hierarchy.pkl')

    dutils.save_obj_to_pkl_file(total_wordpieces_train, 'total_wordpieces',
                                cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    dutils.save_list_to_file(word_vocab.ix_to_token, 'word vocab',
                             cf.DEBUG_FOLDER + '/word_vocab.txt')
    dutils.save_list_to_file(wordpiece_vocab.ix_to_token, 'wordpiece vocab',
                             cf.DEBUG_FOLDER + '/wordpiece_vocab.txt')
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--save_path',
                        help="trained model path",
                        type=str,
                        default=None)
    parser.add_argument('--adjacency',
                        help="use adjacency matrix",
                        type=bool,
                        default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--epoch', help="epoch", type=int, default=50)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--lr',
                        help="learning rate",
                        type=float,
                        default=0.0001)
    parser.add_argument('--embed_size',
                        help="embedding vector size",
                        type=int,
                        default=1024)
    parser.add_argument('--model_dim',
                        help="dim of transformer",
                        type=int,
                        default=1024)
    parser.add_argument('--layers',
                        help="number of layers",
                        type=int,
                        default=6)
    parser.add_argument('--nhead', help="number of head", type=int, default=4)
    parser.add_argument('--drop_rate',
                        help="ratio of dropout",
                        type=float,
                        default=0)
    parser.add_argument('--matrix_position',
                        help="position of adjacency matrix",
                        type=str,
                        default='atom')
    parser.add_argument('--warmup_step',
                        help="warmup step for scheduled learning rate",
                        type=int,
                        default=10000)
    parser.add_argument('--num_workers',
                        help="number of workers",
                        type=int,
                        default=0)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=7)
    #parser.add_argument('--savepath', help="saved model dir", type=str)
    arg = parser.parse_args()

    torch.manual_seed(arg.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    Smiles_vocab = Vocab()
    dataset = SmilesDataset(arg.path,
                            Smiles_vocab,
                            seq_len=arg.seq,
                            mat_position=arg.matrix_position)
    print("Dataset loaded")

    train_dataloader = DataLoader(dataset,
                                  shuffle=True,
                                  batch_size=arg.batch,
                                  num_workers=arg.num_workers,
                                  pin_memory=True)

    model = Smiles_BERT(len(Smiles_vocab),
                        max_len=arg.seq,
                        nhead=arg.nhead,
                        feature_dim=arg.embed_size,
                        feedforward_dim=arg.model_dim,
                        nlayers=arg.layers,
                        adj=arg.adjacency,
                        dropout_rate=arg.drop_rate)
    value_layer = nn.Linear(arg.embed_size, 1)
    mask_layer = Masked_prediction(arg.embed_size, len(Smiles_vocab))
    model = BERT_double_tasks(model, value_layer, mask_layer)
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    optim = Adam(model.parameters(), lr=arg.lr, weight_decay=0)
    scheduled_optim = ScheduledOptim(optim,
                                     arg.embed_size,
                                     n_warmup_steps=arg.warmup_step)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    criterion2 = nn.L1Loss()

    print("Start pre-training")
    for epoch in range(arg.epoch):
        avg_loss = 0
        #hit = 0
        #total = 0
        data_iter = tqdm.tqdm(enumerate(train_dataloader),
                              total=len(train_dataloader))
        position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
        model.train()
        for i, data in data_iter:
            data = {key: value.to(device) for key, value in data.items()}
            if data["smiles_bert_input"].size(0) != arg.batch:
                position_num = torch.arange(arg.seq).repeat(
                    data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                value_out, mask_out = model.forward(
                    data["smiles_bert_input"],
                    position_num,
                    adj_mask=data["smiles_bert_adj_mask"],
                    adj_mat=data["smiles_bert_adjmat"])
            else:
                value_out, mask_out = model.forward(data["smiles_bert_input"],
                                                    position_num)
            #print(output.shape, data["smiles_bert_label"].shape)
            #print(output, data["smiles_bert_label"])
            loss = criterion(mask_out.transpose(
                1, 2), data["smiles_bert_label"]) + criterion2(
                    value_out, data["smiles_bert_value"].view(-1, 1))
            scheduled_optim.zero_grad()
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            scheduled_optim.step_and_update_lr()

            avg_loss += loss.item()

            status = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }
            if i % 1000 == 0:
                data_iter.write(str(status))
            if i % 5000 == 0:
                #print()
                torch.save(
                    model.module.state_dict(),
                    str(arg.save_path) + "/temp_model_" + "epoch_" +
                    str(epoch) + "_" + str(i) + "_" +
                    str(round(avg_loss / (i + 1), 5)))
            #hit = output.argmax(dim=-1).eq(data["smiles_bert_label"])

        print("Epoch: ", epoch, "average loss: ", avg_loss / len(data_iter))

        save_path = str(arg.save_path) + "/nlayers_" + str(
            arg.layers) + "_nhead_" + str(arg.nhead) + "_adj_" + str(
                arg.adjacency) + "_epoch_" + str(epoch) + "_loss_" + str(
                    round(avg_loss / len(data_iter), 5))
        torch.save(model.module.bert.state_dict(), save_path + '.pt')
        model.to(device)
        print("model saved")

        correct = 0
        total = 0
        predicted_list = np.array([])
        target_list = np.array([])
        total_loss = 0
        '''
Example #22
    def inference(self, label_score, k=1):
        label_prob = F.softmax(label_score, dim=-1)
        label_prob, label_pred = label_prob.data.topk(k)
        return label_prob, label_pred


if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16
    bptt = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for sent in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
        for i in range(0, data_tensor.size(1)-1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
Example #23
        return batch_loss

    def inference(self, label_score, k=1):
        if self.num_labels > 2:
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
Example #24
class BaseModel():
    def load_data(self, datafile):

        dataset = pd.read_csv(datafile)
        if self.debug:
            dataset = dataset.iloc[:3000]

        text = 'comment_text'
        self.X = dataset[text].values

        labels = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        # labels = ['severe_toxic']
        assert (len(labels) == self.config.label_size)
        self.y = dataset[labels].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=124)

        ## Build the vocabulary using the train data.
        self.vocab = Vocab()
        train_sents = [get_words(line) for line in self.X_train]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)),
                             threshold=self.config.min_word_freq)
        print('Training on {} samples and validating on {} samples'.format(
            len(self.X_train), len(self.X_val)))
        print()

        self.embedding_matrix = np.random.uniform(
            -0.005, 0.005, size=[len(self.vocab),
                                 self.config.embed_size]).astype('float32')
        with tf.variable_scope("Embeddings") as scope:
            embedding = tf.get_variable("Embeds",
                                        initializer=self.embedding_matrix,
                                        dtype=tf.float32)

        if self.debug:
            return

        ## Populate embedding matrix from pre-trained word embeddings
        pretrained_index = {}
        with open('./WordVectors/crawl-300d-2M.vec') as fh:
            for line in fh:
                word_vec = line.strip().split()
                word = word_vec[0]
                vector = np.asarray(word_vec[1:], dtype='float32')
                pretrained_index[word] = vector

        pw = 0.0

        for word, idx in self.vocab.word_to_idx.items():
            pretrained_vector = pretrained_index.get(word)
            if pretrained_vector is not None:
                self.embedding_matrix[idx] = pretrained_vector
                pw += 1

        print("Found pretrained vectors for {:.2f}% of data".format(
            pw / len(self.vocab) * 100))
        del pretrained_index  ## Freed only because of memory constraints -- don't copy this pattern in general.

    def input_embeddings(self):

        with tf.variable_scope("Embeddings", reuse=True):
            embedding = tf.get_variable("Embeds")

        input_vectors = tf.nn.embedding_lookup(self.embedding_matrix,
                                               self.input_placeholder)
        return input_vectors

    def core_module(self):

        return

    def calculate_loss(self, output):

        labels = self.label_placeholder

        log_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=output,
                                                    labels=labels))

        l2_loss = 0
        for weights in tf.trainable_variables():
            if ("Bias" not in weights.name) and ("Embeddings"
                                                 not in weights.name):
                l2_loss += (self.config.l2 * tf.nn.l2_loss(weights))

        loss = log_loss + l2_loss

        return loss

    def training_operation(self, loss):
        return tf.train.AdamOptimizer(
            learning_rate=self.config.lr).minimize(loss)

    def build_feeddict(self):

        return