def part2(choose):
    train, test = get_text_classification_datasets()
    print(type(train))
    dictionary = build_dict(train, 10)
    trainset_onehot, t = data_preprocess(train, dictionary)
    testset_onehot, t_ = data_preprocess(test, dictionary)
    w = np.zeros((len(dictionary) + 1, 4))  # one weight column per class (plus bias row)
    w_ = w
    j, j_ = 100, 99                         # previous / current loss
    count = 0
    c, p = [], []                           # iteration indices and loss history

    # choose: 1 = SGD (batch 1), 2 = mini-batch 10, 3 = mini-batch 100,
    #         4 = full batch, run until the loss change falls below 1e-4
    settings = {1: (5000, 1), 2: (2000, 10), 3: (1000, 100)}
    if choose in settings:
        max_iter, batch = settings[choose]
        while count < max_iter:
            count += 1
            c.append(count)
            j, w = j_, w_
            j_, w_ = onecycle(trainset_onehot, t, w, batch)
            p.append(j_)
            print("After", count, "iterations, the loss is:", j_)
    elif choose == 4:
        while np.abs(j - j_) > 1e-4:
            count += 1
            c.append(count)
            j, w = j_, w_
            j_, w_ = onecycle(trainset_onehot, t, w, BATCH_SIZE)
            p.append(j_)
            print("After", count, "iterations, the loss is:", j_)

    # print(w)
    print('The train accuracy is', precision(w, trainset_onehot, train.target))
    print('The test accuracy is', precision(w, testset_onehot, test.target))
    plt.plot(c, p)
    plt.show()
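# `onecycle` and `precision` are defined elsewhere in this project. Below is a
# minimal sketch of what `onecycle` might look like, assuming it runs one sweep
# of mini-batch gradient descent on the softmax cross-entropy loss and returns
# (loss, updated weights); ALPHA is a hypothetical step size, and the one-hot
# inputs are assumed to carry a trailing bias column.
import numpy as np

ALPHA = 0.1  # assumed learning rate


def onecycle_sketch(X, t, w, batch_size):
    n = X.shape[0]
    for start in range(0, n, batch_size):
        xb, tb = X[start:start + batch_size], t[start:start + batch_size]
        scores = xb @ w
        scores -= scores.max(axis=1, keepdims=True)   # numerical stability
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
        w = w - ALPHA * xb.T @ (probs - tb) / xb.shape[0]
    # report the full-dataset loss after the sweep
    scores = X @ w
    scores -= scores.max(axis=1, keepdims=True)
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    loss = -np.mean(np.sum(t * np.log(probs + 1e-12), axis=1))
    return loss, w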
def part2():
    data_text_train, data_text_test = get_text_classification_datasets()

    def load_cached():
        # load the preprocessed matrices cached as JSON
        cached = []
        for name in ('train_text', 'train_res', 'test_text', 'test_res'):
            with open(name + '.json', 'r', encoding='utf-8') as fp:
                cached.append(json.load(fp))
        return cached

    try:
        train_text, train_res, test_text, test_res = load_cached()
    except Exception:
        # cache missing: preprocess once, then load the files it writes
        preprocess_get_test(
            data_text_test['data'], data_text_test['target'],
            preprocess_get_dic_train(data_text_train['data'],
                                     data_text_train['target']))
        train_text, train_res, test_text, test_res = load_cached()

    train_text, train_res = np.mat(train_text), np.mat(train_res)
    test_text, test_res = np.mat(test_text), np.mat(test_res)

    # select the optimizer: full-batch (FBGD), stochastic (SGD) or mini-batch (BGD)
    w, b = logistic_algorithm_FBGD(train_text, train_res, 0.1, 2000, 0.001, 1e-5)
    # w, b = logistic_algorithm_SGD(train_text, train_res, 0.1, 5000, 0.001, 1e-5)
    # w, b = logistic_algorithm_BGD(train_text, train_res, 0.1, 300, 0.001, 100, 1e-5)

    acc_train = get_accuracy(train_text, data_text_train.target, w, b)
    acc_test = get_accuracy(test_text, data_text_test.target, w, b)
    print('The training accuracy is:', acc_train)
    print('The test accuracy is:', acc_test)
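# `logistic_algorithm_FBGD` is implemented elsewhere; a minimal full-batch
# sketch, assuming the positional arguments are (X, T, alpha, max_iter, lam,
# tol): learning rate alpha, an L2 penalty lam, and a stopping threshold tol
# on the change in loss.
import numpy as np


def logistic_fbgd_sketch(X, T, alpha, max_iter, lam, tol):
    X, T = np.asarray(X), np.asarray(T)
    n, d = X.shape
    w, b = np.zeros((d, T.shape[1])), np.zeros(T.shape[1])
    prev_loss = np.inf
    for _ in range(max_iter):
        scores = X @ w + b
        scores -= scores.max(axis=1, keepdims=True)
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)
        loss = (-np.mean(np.sum(T * np.log(probs + 1e-12), axis=1))
                + lam * np.sum(w * w))
        if abs(prev_loss - loss) < tol:
            break
        prev_loss = loss
        w -= alpha * (X.T @ (probs - T) / n + 2 * lam * w)
        b -= alpha * (probs - T).mean(axis=0)
    return w, b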
def logistic_regression():
    text_train, text_test = get_text_classification_datasets()
    labels_train = creat_labels(text_train.target)
    my_dict = creat_dict(text_train.data)
    vectors_train = get_vectors(text_train.data, my_dict)
    w = np.zeros([vectors_train.shape[1], 4])
    lam = 1                  # L2 regularization strength
    iterations = 1000
    learningRate = 0.005
    losses = []
    for i in range(iterations):
        loss, grad = getLoss(w, vectors_train, labels_train, lam)
        losses.append(loss)
        w = w - learningRate * grad
    plt.plot(losses)
    plt.show()
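# `getLoss` is defined elsewhere; a minimal sketch, assuming it returns the
# L2-regularized softmax cross-entropy loss and its gradient with respect to
# w, where `labels` is a one-hot matrix of shape (n_samples, 4).
import numpy as np


def getLoss_sketch(w, X, labels, lam):
    n = X.shape[0]
    scores = X @ w
    scores -= scores.max(axis=1, keepdims=True)   # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
    loss = (-np.sum(labels * np.log(probs + 1e-12)) / n
            + (lam / 2) * np.sum(w * w))
    grad = X.T @ (probs - labels) / n + lam * w
    return loss, grad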
def data_preprocess():
    train, test = get_text_classification_datasets()
    # build the vocabulary from the training documents
    train_item = []
    word_count = {}
    vocab = {}
    regular = re.compile(r'[\s]+')
    for item in train.data:
        # strip punctuation, lower-case, then split on whitespace
        words = regular.split(
            item.translate(str.maketrans('', '', string.punctuation)).lower())
        train_item.append(words)
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1
    for word, count in word_count.items():
        if count >= 10:  # keep only words that occur at least 10 times
            vocab[word] = len(vocab)
    return train, test, vocab
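# The vectorization step is not shown above. A minimal multi-hot sketch using
# the returned vocabulary, assuming the same punctuation-stripping and
# lower-casing as data_preprocess:
import re
import string
import numpy as np


def docs_to_multihot_sketch(docs, vocab):
    X = np.zeros((len(docs), len(vocab)))
    splitter = re.compile(r'\s+')
    for i, doc in enumerate(docs):
        words = splitter.split(
            doc.translate(str.maketrans('', '', string.punctuation)).lower())
        for word in words:
            if word in vocab:
                X[i, vocab[word]] = 1   # mark presence of each in-vocabulary word
    return X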
def program_parser():
    parser = argparse.ArgumentParser(description='Assignment 2')
    parser.add_argument('--algorithm',
                        choices=["least_square", "perceptron", "logistic"],
                        help='the algorithm to run')
    parser.add_argument('--n',
                        choices=["run", "batch", "lambda", "alpha", "check"],
                        default="run",
                        help='the sub-task for the logistic algorithm')
    args = parser.parse_args()
    linear_dataset = get_linear_seperatable_2d_2c_dataset()
    lsm = LSM(linear_dataset)
    perceptron = Perceptron(linear_dataset)
    algos = {"least_square": lsm.run, "perceptron": perceptron.run}
    if args.algorithm == "logistic":
        np.random.seed(2333)
        dataset_train, dataset_test = get_text_classification_datasets()
        logistic = Logistic(dataset_train, dataset_test)
        if args.n == "run":
            logistic.show()
        elif args.n == "check":
            logistic.check_gradient()
        elif args.n == "batch":
            logistic.show_batch_diff()
        elif args.n == "lambda":
            logistic.show_lamb_diff()
        elif args.n == "alpha":
            logistic.show_alpha_diff()
    elif args.algorithm in algos:
        algos[args.algorithm]()
    else:
        parser.print_help()
def get_dataset():
    raw_train, raw_test = get_text_classification_datasets()

    def transfer(st):
        # str.replace returns a new string, so the result must be reassigned
        # (the original calls discarded it); map punctuation and whitespace
        # characters to spaces so split() still finds word boundaries
        st = st.lower()
        for i in string.whitespace:
            st = st.replace(i, ' ')
        for i in string.punctuation:
            st = st.replace(i, ' ')
        return st

    def preprocess():
        train_set = DataSet()
        for i in range(len(raw_train['data'])):
            di = transfer(raw_train['data'][i])
            train_set.append(
                Instance(sentence=di, target=int(raw_train['target'][i])))
        train_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
        train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        test_set = DataSet()
        for i in range(len(raw_test['data'])):
            di = transfer(raw_test['data'][i])
            test_set.append(
                Instance(sentence=di, target=int(raw_test['target'][i])))
        test_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
        test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        word_dict = Vocabulary(min_freq=2)
        train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        word_dict.build_vocab()
        word_dict.index_dataset(train_set, field_name='words', new_field_name='words')
        word_dict.index_dataset(test_set, field_name='words', new_field_name='words')
        return train_set, test_set, word_dict

    train_set, test_set, word_dict = preprocess()
    train_set.rename_field('words', Const.INPUT)
    train_set.rename_field('seq_len', Const.INPUT_LEN)
    train_set.rename_field('target', Const.TARGET)
    test_set.rename_field('words', Const.INPUT)
    test_set.rename_field('seq_len', Const.INPUT_LEN)
    test_set.rename_field('target', Const.TARGET)
    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(Const.TARGET)
    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(Const.TARGET)
    return train_set, test_set, word_dict
# Part 1: least squares
lsm = LSM()                      # avoid shadowing the class name
lsm.train(train_set.X, 2 * train_set.y - 1)   # map {0, 1} labels to {-1, +1}
y_pred = lsm.predict(test_set.X)
print("LSM accuracy:", test_set.acc(y_pred))
d.plot(plt)
lsm.plot(plt)

# Part 2: perceptron
model = perceptron()             # avoid shadowing the class name
model.train(train_set.X, train_set.y)
y_pred = model.predict(test_set.X)
print("perceptron accuracy:", test_set.acc(y_pred))
plt.figure(0)
model.plot(plt)
plt.legend(loc="upper right")
plt.show()

# Part 3: text classification
text_train, text_test = get_text_classification_datasets()
train_vector, test_vector = preprocess(text_train, text_test)
N = train_vector.shape[0]
plt.figure("text_classification")
# compare batch sizes: SGD (1), mini-batch, and full batch (N)
for j, i in enumerate([1, batch_size, N]):
    sm = softmax()
    sm.train(train_vector, text_train.target, batch_size=i)
    print(sm.accuracy(test_vector, text_test.target))
    plt.subplot(3, 1, j + 1)
    sm.plot(plt)
plt.show()
ax = plt.subplot(324)
perceptron(data_sample.X, data_sample.y, 30, 0.004)
ax = plt.subplot(325)
perceptron(data_sample.X, data_sample.y, 30, 0.005)
ax = plt.subplot(326)
perceptron(data_sample.X, data_sample.y, 30, 0.006)
# print("accuracy:" + accurate(data_sample.X, data_sample.y, w))
# data_sample.plot(plt).show()

# In[5]:

# part2
from handout import get_text_classification_datasets

trainData, testData = get_text_classification_datasets()

# In[6]:

import string

trainDataset = trainData['data']


def getListX(dataset):
    # build a word dictionary over the lower-cased, punctuation-stripped documents
    dic = {}
    for data in dataset:   # was `trainDataset`, which ignored the parameter
        data = data.lower()
        for i in data:
            if i in string.punctuation:
                data = data.replace(i, " ")
        data = data.split()
max_epoch_num = args.max_epoch_num
# argparse delivers strings, so compare against the literal "True"
auto_terminate = (args.auto_terminate == "True")
observe_dif_times = args.observe_loss_sequence_length
terminate_threshold = args.terminate_threshold
np.random.seed(2019)

if __name__ == '__main__':
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    categories = dataset_train.target_names
    # training data and labels
    training_data = dataset_train.data
    training_labels = np.array(dataset_train.target)
    clean_training_data = utils.clean_dataset(training_data)
    mapping_dict = utils.build_mapping_dict(clean_training_data)
    feature_vector = utils.data2vec(clean_training_data, mapping_dict)
    print(len(feature_vector[0]))
    # build model
    softmax_model = model.Softmax_CrossEntropy_model(
        class_num=len(categories),
        feature_length=feature_vector.shape[1],
        learning_rate=learning_rate,
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    # dump the datasets to tab-separated files so CSVLoader can read them
    with open("formalized_train_data.csv", "w") as file:
        for i in range(len(dataset_train_p2.data)):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(len(dataset_test_p2.data)):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")
    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary
    # use the Vocabulary class to count words and map word sequences to index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # convert labels to integers and mark them as the target field
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    from fastNLP.models import CNNText
    embed_dim = 2048  # 50
    model = CNNText((len(vocab), embed_dim), num_classes=4, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
    # define the trainer and run training
    trainer = Trainer(model=model, train_data=train_dataset,
                      dev_data=test_dataset, loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric())
    trainer.train()
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    # dump the datasets to tab-separated files so CSVLoader can read them
    with open("formalized_train_data.csv", "w") as file:
        for i in range(len(dataset_train_p2.data)):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(len(dataset_test_p2.data)):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary
    # use the Vocabulary class to count words and map word sequences to index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # convert labels to integers and mark them as the target field
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4
    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)
    if torch.cuda.is_available():  # move the model to GPU when available
        model = model.cuda()
    trainer = Trainer(model=model, train_data=train_dataset,
                      dev_data=test_dataset, loss=CrossEntropyLoss(),
                      n_epochs=100, metrics=AccuracyMetric())
    trainer.train()
    # one-hot encode the train and test targets
    for t in ts:
        temp = [0] * K
        temp[t] = 1
        ys.append(temp)
    for t in tts:
        temp = [0] * K
        temp[t] = 1
        tys.append(temp)
    return ys, tys


if __name__ == '__main__':
    x = get_text_classification_datasets()
    # x[0] is the training split, x[1] the test split:
    # x[0].data[i] is the i-th document, x[0].target[i] its label
    xs, txs = preprocess_data(x[0].data, x[1].data)
    ts, tts = preprocess_target(len(x[0].target_names), x[0].target, x[1].target)
    print('preprocessing finished')
    lc = logistic_classifier(xs, ts)
    # Test
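# `logistic_classifier` is defined elsewhere; a minimal sketch, assuming it
# fits softmax regression on the multi-hot features xs and one-hot targets ts
# by full-batch gradient descent (LR and EPOCHS are hypothetical constants):
import numpy as np

LR, EPOCHS = 0.1, 1000


def logistic_classifier_sketch(xs, ts):
    X, T = np.array(xs), np.array(ts)
    w = np.zeros((X.shape[1], T.shape[1]))
    for _ in range(EPOCHS):
        scores = X @ w
        scores -= scores.max(axis=1, keepdims=True)
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)
        w -= LR * X.T @ (probs - T) / X.shape[0]
    return w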
draw(plt, X_t, ds_test.y, W, "Picture 4 Test Data")
draw_taining_loss(plt, epoch, lossHistory)
Perceptron(d, 100, 10, 0.01, plt)

## PART 2
import string
from handout import get_linear_seperatable_2d_2c_dataset, get_text_classification_datasets
import numpy as np
import matplotlib.pyplot as plt
import math

categories, dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()


def document_embeding(dataset_train):
    # build the token list from the training data
    list_of_string = []
    tokenized_sentences = []
    translator = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
    for ele in dataset_train.data:
        # deduplicate tokens within one document; split() already breaks on
        # every whitespace character, so no separate whitespace replacement
        # is needed
        sentence = list(set(ele.lower().translate(translator).split()))
        tokenized_sentences.append(sentence)
        list_of_string += sentence
def task2():
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    N = len(dataset_train.data)
    global categories_size, dimension, lam, rate
    lam = 0.1
    rate = 0.1
    categories_size = len(dataset_train.target_names)

    # build the vocabulary from words that appear at least 10 times
    count = {}
    dataset_train.data = deal_text(dataset_train.data)
    dataset_test.data = deal_text(dataset_test.data)
    for text in dataset_train.data:
        for word in text.split(' '):
            if word != "":
                count[word] = count.get(word, 0) + 1
    vocabulary = {}
    dimension = 0
    for word in count:
        if count[word] >= 10:
            vocabulary[word] = dimension
            dimension += 1

    X, Y = deal_dateset(dataset_train, vocabulary)
    tX, tY = deal_dateset(dataset_test, vocabulary)
    batch_size = int(input("batch size ="))
    batches = [[X[k:k + batch_size], Y[k:k + batch_size]]
               for k in range(0, N, batch_size)]
    W = np.zeros((dimension, categories_size))
    b = np.zeros((categories_size, 1))
    lam = 0.001
    rate = 0.2
    epoch = 0
    his_loss = 100000
    min_loss = 100000
    max_test_accuracy = 0
    min_loss_epoch = 0
    train_loss_array = []
    test_loss_array = []
    rate_array = []
    while True:
        random.shuffle(batches)
        epoch += 1
        print("epoch:", epoch)
        for batch in batches:
            W, b = update_batch(batch, W, b, 0)
        rate_array.append(rate)
        loss, accuracy = test(X, Y, W, b)
        print("train dataset loss=", loss, ", accuracy=", accuracy)
        test_loss, test_accuracy = test(tX, tY, W, b)
        print("test dataset loss=", test_loss, ", accuracy=", test_accuracy)
        # "bold driver" style schedule: grow the rate by 5% while the test
        # loss keeps improving, reset it otherwise
        if test_loss < his_loss:
            rate = rate * 1.05
        else:
            rate = 0.1
        his_loss = test_loss
        train_loss_array.append(loss)
        test_loss_array.append(test_loss)
        # early stopping: quit once the test loss has not improved for 20 epochs
        if test_loss < min_loss:
            min_loss = test_loss
            min_loss_epoch = epoch
        elif epoch - min_loss_epoch > 20:
            break
        max_test_accuracy = max(max_test_accuracy, test_accuracy)
    print("max test dataset accuracy =", max_test_accuracy)

    time = np.arange(epoch)
    plt.plot(time, train_loss_array)
    plt.plot(time, test_loss_array)
    plt.legend(["train loss", "test loss"])
    plt.show()
    plt.plot(time, rate_array)
    plt.legend(["learning rate"])
    plt.show()
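# `update_batch` and `test` are defined elsewhere; a minimal sketch of the
# update step, assuming W has shape (dimension, categories_size), b has shape
# (categories_size, 1), each batch holds (features, one-hot labels), and the
# module-level globals `rate` and `lam` are the step size and L2 penalty (the
# last argument is unused here, mirroring the call above):
import numpy as np


def update_batch_sketch(batch, W, b, _unused):
    X, Y = np.asarray(batch[0]), np.asarray(batch[1])
    scores = X @ W + b.T                          # broadcast the bias row
    scores -= scores.max(axis=1, keepdims=True)   # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
    grad_W = X.T @ (probs - Y) / X.shape[0] + lam * W
    grad_b = (probs - Y).mean(axis=0).reshape(-1, 1)
    return W - rate * grad_W, b - rate * grad_b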
    Train = trainer(model=model, data=data, kwargs=kwargs.copy())
    model, info = Train.train()
    # plot the training curves
    plot_loss_acc(kwargs.copy(), info)
    pred_test_y, _ = model.loss(test_x)
    test_acc = np.sum(
        np.argmax(pred_test_y, axis=1) == np.argmax(test_y, axis=1)) / len(test_x)
    print("test acc:")
    print(test_acc)


if __name__ == "__main__":
    # load data
    data_set, test_set = get_text_classification_datasets()
    # initialize the data processor
    data = {}
    dp = data_processor()
    size_voca = dp.generate_vocabulary(data_set.data)
    # split data into training and validation parts
    raw_data, num_classes = split_data(split_point=2000, data_set=data_set)
    # process data
    data["train_x"], data["train_y"] = dp.process_data(
        raw_data["train_x"], raw_data["train_y"], num_classes)
    data["val_x"], data["val_y"] = dp.process_data(
        raw_data["val_x"], raw_data["val_y"], num_classes)
    # choose learning rate