Example 1
def test_apply2(self):
    def split_sent(ins):
        return ins['raw_sentence'].split()

    csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')
    dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
    # Drop empty sentences, then tokenize into a new 'words' input field.
    dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True)
    dataset.apply(split_sent, new_field_name='words', is_input=True)
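For context, a minimal standalone sketch of the same pattern outside the test class. It assumes the older fastNLP API in which `CSVLoader` lives in `fastNLP.io` and `load` returns a `DataSet` directly, as the test above does:

from fastNLP.io import CSVLoader

loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')
dataset = loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
# Tokenize on whitespace; is_input=True marks the new field as model input.
dataset.apply(lambda ins: ins['raw_sentence'].split(),
              new_field_name='words', is_input=True)
print(dataset[0])  # an Instance with 'raw_sentence', 'label' and 'words' fields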
Example 2
def load_sst2(dict_path, embedding_path=None):
    '''Load the SST-2 train/dev splits and build word and label vocabularies.

    :param dict_path: directory containing train.tsv and dev.tsv, e.g.
        /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: path to a GloVe 300d embedding txt file (optional)
    :return: ((train_data, dev_data), (vocab, label_vocab)), plus the
        pretrained embedding as a third element when embedding_path is given
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    # A single-file load is stored under the default 'train' key of the bundle.
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    # Tokenize the raw sentence on whitespace.
    train_data.apply_field(lambda x: x.split(),
                           field_name='words',
                           new_field_name='words')
    dev_data.apply_field(lambda x: x.split(),
                         field_name='words',
                         new_field_name='words')

    # Record each sequence's length for padding/masking.
    train_data.apply_field(lambda x: len(x),
                           field_name='words',
                           new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x),
                         field_name='words',
                         new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2)
    # Successive from_dataset calls accumulate counts, so the vocabulary
    # covers both the train and dev words.
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')

    # The label vocabulary needs neither padding nor unknown tokens.
    label_vocab = Vocabulary(padding=None,
                             unknown=None).from_dataset(train_data,
                                                        field_name='target')

    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    # Const.INPUT is 'words', Const.INPUT_LEN is 'seq_len', Const.TARGET is 'target'.
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab,
                                        label_vocab), pretrained_embedding

    else:
        return (train_data, dev_data), (vocab, label_vocab)
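A hedged usage sketch of `load_sst2`; the directory path below is a placeholder, and the optional embedding branch is skipped:

# Placeholder path for illustration.
(train_data, dev_data), (vocab, label_vocab) = load_sst2('/path/to/SST-2/')
print(len(train_data), len(dev_data))  # instances per split
print(len(vocab), len(label_vocab))    # vocabulary sizes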
Example 3
def test_CSVLoader(self):
    ds = CSVLoader(sep='\t', headers=['words', 'label']) \
        .load('test/data_for_tests/tutorial_sample_dataset.csv')
    assert len(ds) > 0
Example 4
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(
                document2line(dataset_train_p2.data[i]) + "\t" +
                str(dataset_train_p2.target[i]) + '\n')
        file.close()

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(
                document2line(dataset_test_p2.data[i]) + "\t" +
                str(dataset_test_p2.target[i]) + '\n')
        file2.close()

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    # Lowercase the raw sentence, then split it into words as model input.
    train_dataset.apply(lambda x: x['raw_sentence'].lower(),
                        new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(),
                        new_field_name='words',
                        is_input=True)

    test_dataset.apply(lambda x: x['raw_sentence'].lower(),
                       new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(),
                       new_field_name='words',
                       is_input=True)

    from fastNLP import Vocabulary

    # Use Vocabulary to count words and convert the word sequences to index sequences.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset,
                                                field_name='words')
    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')

    # Convert the label to an integer and set it as the target.
    train_dataset.apply(lambda x: int(x['label']),
                        new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']),
                       new_field_name='target',
                       is_target=True)

    from fastNLP.models import CNNText
    embed_dim = 2048  # originally 50
    model = CNNText((len(vocab), embed_dim),
                    num_classes=4,
                    padding=2,
                    dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # Define the Trainer and run training.
    trainer = Trainer(model=model,
                      train_data=train_dataset,
                      dev_data=test_dataset,
                      loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric())
    trainer.train()
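Both `run_cnn` above and `run_rnn` below call two helpers that the snippets do not define. A minimal sketch under stated assumptions: the data is a four-class subset of scikit-learn's 20 Newsgroups (matching `num_classes=4` above), and `document2line` flattens a document so that one document fits one CSV row. The category list is an assumption:

from sklearn.datasets import fetch_20newsgroups

def get_text_classification_datasets():
    # Assumed four-class subset, consistent with num_classes=4 above.
    categories = ['comp.os.ms-windows.misc', 'rec.motorcycles',
                  'sci.space', 'talk.politics.misc']
    train = fetch_20newsgroups(subset='train', categories=categories)
    test = fetch_20newsgroups(subset='test', categories=categories)
    return train, test

def document2line(doc):
    # Collapse all whitespace (including newlines) so one document = one line.
    return ' '.join(doc.split())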
Example 5
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(
                document2line(dataset_train_p2.data[i]) + "\t" +
                str(dataset_train_p2.target[i]) + '\n')
        file.close()

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(
                document2line(dataset_test_p2.data[i]) + "\t" +
                str(dataset_test_p2.target[i]) + '\n')
        file2.close()

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    # Lowercase the raw sentence, then split it into words as model input.
    train_dataset.apply(lambda x: x['raw_sentence'].lower(),
                        new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(),
                        new_field_name='words',
                        is_input=True)

    test_dataset.apply(lambda x: x['raw_sentence'].lower(),
                       new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(),
                       new_field_name='words',
                       is_input=True)

    from fastNLP import Vocabulary

    # Use Vocabulary to count words and convert the word sequences to index sequences.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset,
                                                field_name='words')
    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')
    # Convert the label to an integer and set it as the target.
    train_dataset.apply(lambda x: int(x['label']),
                        new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']),
                       new_field_name='target',
                       is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4

    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)
    use_gpu = torch.cuda.is_available()  # check whether a GPU is available
    if use_gpu:
        model = model.cuda()

    trainer = Trainer(model=model,
                      train_data=train_dataset,
                      dev_data=test_dataset,
                      loss=CrossEntropyLoss(),
                      n_epochs=100,
                      metrics=AccuracyMetric())
    trainer.train()
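The `Rnn` class is also not defined in the snippet. A minimal compatible sketch, assuming the constructor signature `Rnn(vocab_size, embed_dim, hidden_dim, num_layers, num_classes)` used above and fastNLP's convention that `forward` receives the 'words' input field and returns its prediction under the 'pred' key:

import torch
import torch.nn as nn

class Rnn(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, words):
        # words: LongTensor of token indices, shape (batch, seq_len)
        output, _ = self.lstm(self.embed(words))
        # Classify from the last time step; fastNLP's Trainer looks for 'pred'.
        return {'pred': self.fc(output[:, -1, :])}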