Example 1
def create_and_train_network(args):
    name = args['name']
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    LOG_DIR = os.path.join('log', f'{name}-{now}')

    train_data = ReviewDataset('train.npy',
                               test_ratio=0.1,
                               label_smoothing=args['label_smoothing'],
                               output_type=args['output_type'],
                               data_type=args['network_type'])

    if args['num_words']:
        train_data.num_words = args['num_words']
    else:
        train_data.find_out_num_words()

    # Define network
    net = Network(args, num_words=train_data.num_words, logdir=LOG_DIR)

    # Load network
    if args['load']:
        load_model = args['load']
        net.model.load_weights(f'models/{load_model}')
        net.train_epoch(train_data, args, 1)
        metrics = net.evaluate(train_data, args, 30)
        print(f'Loaded network: {metrics}')

    for epoch in range(args['epochs']):
        net.train_epoch(train_data, args, 100)
        metrics = net.evaluate(train_data, args, 30)
        print(f'Epoch {epoch}:{metrics}')
        # Save checkpoint
        if epoch % 10 == 9:
            net.model.save_weights(f'models/{name}-{epoch}')
    return net
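Below is a minimal usage sketch, not part of the original example: the args dictionary only illustrates the keys that create_and_train_network reads above, and every value is a hypothetical placeholder.

# Hypothetical invocation; keys mirror the lookups inside create_and_train_network.
args = {
    'name': 'sentiment-net',
    'label_smoothing': 0.1,
    'output_type': 'binary',
    'network_type': 'recurrent',
    'num_words': 20000,   # or None to let the dataset count them itself
    'load': None,         # or a weights filename under models/ to resume from
    'epochs': 50,
}
net = create_and_train_network(args)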
Example 2
def get_dataset(embeddings, paths, val=False):
    """
    Gathers all the review files pathnames, and returns a ReviewDataset object.
    If val == True, splits the data into 90% training data and 10% validation data.
    """
    files = []
    labels = []

    for sentiment in paths:
        files.append([])
        labels.append([])

        for dirpath, dirnames, filenames in os.walk(sentiment):
            for filename in filenames:
                files[-1].append(os.path.join(dirpath, filename))

                if sentiment == path_neg or sentiment == path_test_neg:
                    labels[-1].append(0)
                elif sentiment == path_pos or sentiment == path_test_pos:
                    labels[-1].append(1)

    if val:
        split = int(len(files[0]) * 0.9)
        train_files = files[0][:split]
        train_files.extend(files[1][:split])
        train_labels = labels[0][:split]
        train_labels.extend(labels[1][:split])
        train_data = ReviewDataset(embeddings, train_files, train_labels)

        val_files = files[0][split:]
        val_files.extend(files[1][split:])
        val_labels = labels[0][split:]
        val_labels.extend(labels[1][split:])
        val_data = ReviewDataset(embeddings, val_files, val_labels)

        return train_data, val_data

    files = [file for files_ in files for file in files_]
    labels = [label for labels_ in labels for label in labels_]

    data = ReviewDataset(embeddings, files, labels)
    return data
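A hedged call example, assuming path_neg, path_pos, path_test_neg, and path_test_pos are the module-level directory paths the function compares against; the embeddings object is a placeholder.

# Hypothetical usage: with val=True the function returns a train/validation pair.
train_data, val_data = get_dataset(embeddings, [path_neg, path_pos], val=True)
# Without val it returns a single flattened ReviewDataset.
test_data = get_dataset(embeddings, [path_test_neg, path_test_pos])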
Example 3
        cur_idx += 1
    return result


if __name__ == '__main__':
    THRESH = 0.10
    SAVING_DIR = '../models/'
    MODELS = [
        'best_bert_model_774', 'best_bert_model_77', 'best_bert_model_cv0',
        'best_bert_model_cv1', 'best_bert_model_cv2', 'best_bert_model_cv3',
        'best_bert_model_cv4'
    ]
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch',
        do_lower_case=True)
    test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv', None,
                                 tokenizer)
    test_loader = DataLoader(test_dataset,
                             12,
                             collate_fn=test_dataset.batchify,
                             shuffle=False,
                             num_workers=5)

    ret = None
    for name in MODELS:
        model_path = osp.join(SAVING_DIR, name)
        model = OpinioNet.from_pretrained(
            '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch')
        model.load_state_dict(torch.load(model_path))
        model.cuda()
        ret = accum_result(ret, eval_epoch(model, test_loader))
        del model
Example 4
    MODELS = list(zip(WEIGHT_NAMES, MODEL_NAMES, THRESHS))
    # tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODELS['roberta']['path'], do_lower_case=True)
    # test_dataset = ReviewDataset(args.rv, args.lb, tokenizer, 'laptop')
    # test_loader = DataLoader(test_dataset, args.bs, collate_fn=test_dataset.batchify, shuffle=False, num_workers=5)
    ret = None
    raw = None
    lb = None
    num_model = 0
    for weight_name, model_name, thresh in MODELS:
        if not osp.isfile('../models/' + weight_name):
            continue
        num_model += 1
        model_config = PRETRAINED_MODELS[model_name]
        tokenizer = BertTokenizer.from_pretrained(model_config['path'],
                                                  do_lower_case=True)
        test_dataset = ReviewDataset(args.rv, args.lb, tokenizer, 'laptop')
        test_loader = DataLoader(test_dataset,
                                 args.bs,
                                 collate_fn=test_dataset.batchify,
                                 shuffle=False,
                                 num_workers=5)

        if not raw:
            raw = [s[0][0] for s in test_dataset.samples]
        if not lb and args.lb:
            lb = [s[0][1] for s in test_dataset.samples]

        model = OpinioNet.from_pretrained(model_config['path'],
                                          version=model_config['version'],
                                          focal=model_config['focal'])
        print(weight_name)
Example 5
				if not isbad:
					nmsopns.append(opn)
			results[i] = nmsopns
			# print(results)
		return results


if __name__ == '__main__':
	from pytorch_pretrained_bert import BertTokenizer
	from dataset import ReviewDataset

	tokenizer = BertTokenizer.from_pretrained('/home/zydq/.torch/models/bert/ERNIE',
											  do_lower_case=True)
	model = OpinioNet.from_pretrained('/home/zydq/.torch/models/bert/ERNIE')
	model.cuda()
	model.train()

	d = ReviewDataset('../data/TRAIN/Train_reviews.csv', '../data/TRAIN/Train_labels.csv', tokenizer)
	b_raw, b_in, b_tgt = d.batchify(d[:10])

	for i in range(len(b_in)):
		b_in[i] = b_in[i].cuda()
	for i in range(len(b_tgt)):
		b_tgt[i] = b_tgt[i].cuda()
	print(b_in)
	probs, logits = model.forward(b_in)
	loss = model.loss(logits, b_tgt)
	result = model.nms(probs)
	print(loss)
	print(result)
Example 6
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    train=False,
)
# handle dirs
handle_dirs(args.save_dir)

vectorizer_pth = os.path.join(args.save_dir, args.vectorizer_file)
if args.reload_from_files:
    # training from a checkpoint
    print("Loading dataset and vectorizer")
    dataset = ReviewDataset.load_dataset_and_load_vectorizer(
        args.review_csv, vectorizer_pth)
else:
    print("Loading dataset and creating vectorizer")
    # create dataset and vectorizer
    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
    dataset.save_vectorizer(vectorizer_pth)

vectorizer = dataset.get_vectorizer()

classifier = ReviewPerceptronClassifier(num_features=len(vectorizer.review_vocab),
                                        num_classes=1)
# classifier = ReviewMLPClassifier(num_features=len(vectorizer.review_vocab), num_classes=1, hidden_layer_dim=[100])

args.classifier = classifier
args.vectorizer = vectorizer
Example 7
		LABEL_DIR = '../data/TRAIN/Train_laptop_corpus_labels.csv'
		SUBMIT_DIR = None


	with open(THRESH_DIR, 'r', encoding='utf-8') as f:
		thresh_dict = json.load(f)

	WEIGHT_NAMES, MODEL_NAMES, THRESHS = [], [], []
	for k, v in thresh_dict.items():
		WEIGHT_NAMES.append(k)
		MODEL_NAMES.append(v['name'])
		THRESHS.append(v['thresh'])

	MODELS = list(zip(WEIGHT_NAMES, MODEL_NAMES, THRESHS))
	tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODELS['roberta']['path'], do_lower_case=True)
	test_dataset = ReviewDataset(DATA_DIR, None, tokenizer, 'laptop')
	test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify, shuffle=False, num_workers=5)
	ret = None
	num_model = 0
	for weight_name, model_name, thresh in MODELS:
		if not osp.isfile('../models/' + weight_name):
			continue
		num_model += 1
		model_config = PRETRAINED_MODELS[model_name]
		tokenizer = BertTokenizer.from_pretrained(model_config['path'], do_lower_case=True)
		test_dataset = ReviewDataset(DATA_DIR, None, tokenizer, 'laptop')
		test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify, shuffle=False, num_workers=5)
		print(model_config)
		model = OpinioNet.from_pretrained(model_config['path'], version=model_config['version'], focal=model_config['focal'])
		model.load_state_dict(torch.load('../models/' + weight_name))
		model.cuda()
Example 8
                    },
                    ignore_index=True)
            cur_idx += 1

        step += 1
    return result


if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch',
        do_lower_case=True)
    test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv',
                                 None,
                                 tokenizer,
                                 type='laptop')
    test_loader = DataLoader(test_dataset,
                             12,
                             collate_fn=test_dataset.batchify,
                             shuffle=False,
                             num_workers=5)

    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch')
    model.load_state_dict(torch.load('../models/saved_best_model_wwm_ext'))
    model.cuda()
    result = eval_epoch(model, test_loader)
    import time
    result.to_csv('../submit/result-' + str(round(time.time())) + '.csv',
                  header=False,