def test_apply2(self):
    def split_sent(ins):
        return ins['raw_sentence'].split()

    csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')
    dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
    # Drop empty sentences, then tokenize into a new 'words' input field.
    dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True)
    dataset.apply(split_sent, new_field_name='words', is_input=True)
def load_sst2(dict_path, embedding_path=None):
    '''
    :param dict_path: /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: glove 300d txt
    :return:
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    # Tokenize the sentences and record each sequence length.
    train_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')
    dev_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')
    train_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    # Build the word vocabulary from both splits.
    vocab = Vocabulary(min_freq=2)
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')

    # Build the label vocabulary (no padding/unknown tokens) and index the targets.
    label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(train_data, field_name='target')
    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    # Convert the word sequences to index sequences.
    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)
    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab, label_vocab), pretrained_embedding
    else:
        return (train_data, dev_data), (vocab, label_vocab)
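# `load_word_emb` is called above but not defined in this file. Below is a
# minimal sketch of what it might look like, assuming a plain-text GloVe file
# with one "word v1 v2 ... v300" entry per line; words missing from the file
# keep a small random initialization. The name and signature follow the call
# site above; the body is an assumption, not the project's actual code.
import numpy as np

def load_word_emb(embedding_path, embed_dim, vocab):
    # Start from small random vectors so OOV words remain usable.
    emb = np.random.uniform(-0.1, 0.1, (len(vocab), embed_dim)).astype('float32')
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != embed_dim + 1:
                continue  # skip malformed lines or headers
            word = parts[0]
            if word in vocab:
                emb[vocab.to_index(word)] = np.array(parts[1:], dtype='float32')
    return emb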
def test_CSVLoader(self):
    ds = CSVLoader(sep='\t', headers=['words', 'label']) \
        .load('test/data_for_tests/tutorial_sample_dataset.csv')
    assert len(ds) > 0
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    # Dump the raw documents to temporary tab-separated files so that
    # CSVLoader can read them back as fastNLP DataSets.
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" + str(dataset_train_p2.target[i]) + '\n')
    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" + str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")
    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    # Lower-case and tokenize; 'words' becomes the model input field.
    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary

    # Use Vocabulary to count the words and convert each word sequence
    # into a sequence of indices.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # Convert the labels to integers and mark them as the target field.
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    from fastNLP.models import CNNText
    embed_dim = 2048  # 50
    model = CNNText((len(vocab), embed_dim), num_classes=4, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # Define the trainer and start training.
    trainer = Trainer(model=model, train_data=train_dataset, dev_data=test_dataset,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric())
    trainer.train()
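# `get_text_classification_datasets` and `document2line` are used above but not
# defined here. Hedged sketches of plausible implementations follow: the loader
# is assumed to wrap sklearn's 20 Newsgroups fetcher restricted to four
# categories (matching num_classes=4 above), and `document2line` is assumed to
# flatten a document onto a single line so it survives the tab-separated dump.
# Both bodies are assumptions, not the project's actual implementations.
import re
from sklearn.datasets import fetch_20newsgroups

def get_text_classification_datasets():
    # Four categories, matching num_classes=4 in run_cnn/run_rnn (assumption).
    categories = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
    train = fetch_20newsgroups(subset='train', categories=categories)
    test = fetch_20newsgroups(subset='test', categories=categories)
    return train, test

def document2line(doc):
    # Collapse all whitespace (including newlines and tabs) so that each
    # document occupies exactly one line of the tab-separated file.
    return re.sub(r'\s+', ' ', doc).strip()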
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    # Same preprocessing as run_cnn: dump to tab-separated files and reload.
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" + str(dataset_train_p2.target[i]) + '\n')
    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" + str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary

    # Use Vocabulary to count the words and convert each word sequence
    # into a sequence of indices.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # Convert the labels to integers and mark them as the target field.
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4
    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)

    use_gpu = torch.cuda.is_available()  # check whether GPU acceleration is available
    if use_gpu:
        model = model.cuda()

    trainer = Trainer(model=model, train_data=train_dataset, dev_data=test_dataset,
                      loss=CrossEntropyLoss(), n_epochs=100, metrics=AccuracyMetric())
    trainer.train()
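# `Rnn` is instantiated above but not defined in this file. A minimal sketch,
# assuming the positional signature Rnn(vocab_size, embed_dim, hidden_dim,
# num_layers, num_classes) implied by the call site. To cooperate with
# fastNLP's Trainer/CrossEntropyLoss/AccuracyMetric, forward() takes the
# 'words' input field and returns a dict with a 'pred' key; this exact layout
# is an assumption, not the project's actual model.
import torch
import torch.nn as nn

class Rnn(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, words):
        # words: [batch_size, seq_len] tensor of word indices (the 'words' input field)
        x = self.embed(words)               # [batch, seq_len, embed_dim]
        output, _ = self.lstm(x)            # [batch, seq_len, hidden_dim]
        logits = self.fc(output[:, -1, :])  # classify from the last time step
        return {'pred': logits}             # fastNLP expects a dict output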