Code Example #1
File: run_classifiy.py  Project: bigheiniu/MWSS-1
def build_model(args):
    if args.clf_model.lower() == "cnn":
        # the DistilBERT tokenizer is used here only for simple text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)

    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)

        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # optionally freeze the transformer weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)

        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the transformer weights (disabled here)
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False

    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)

    model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
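
A minimal sketch of how build_model might be invoked, assuming an argparse.Namespace that carries the fields the function reads (clf_model, model_name_or_path, do_lower_case, num_labels, task_name, freeze, multi_head, device); the concrete values below are illustrative only and are not taken from the original project.

import argparse
import torch

# Illustrative arguments only; the real project populates these from command-line flags.
args = argparse.Namespace(
    clf_model="bert",
    model_name_or_path="bert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="classification",   # assumed task name
    freeze=False,
    multi_head=3,                 # assumed number of extra class heads
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

tokenizer, model = build_model(args)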
Code Example #2
def main(args):
    # Device configuration
    device = torch.device(
        'cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')
    num_epochs = 80
    num_classes = 8
    learning_rate = 0.08
    num_views = 3
    num_layers = 4
    data_path = args.dir
    file_list = [
        './data/train_web_content.npy', './data/train_web_links.npy',
        './data/train_web_title.npy', './data/test_web_content.npy',
        './data/test_web_links.npy', './data/test_web_title.npy',
        './data/train_label.npy', './data/test_label.npy'
    ]
    files_exist = list(map(os.path.exists, file_list))
    if not all(files_exist):
        print(
            'Raw data has not been pre-processed! Start pre-processing the raw data.'
        )
        data_loader.preprocess(data_path)
    else:
        print('Loading the existing data set...')
    train_dataset = data_loader.Load_datasets('train', num_classes)
    train_loader = DataLoader(train_dataset,
                              batch_size=32,
                              shuffle=True,
                              num_workers=4)
    input_dims = np.array(train_dataset.data[0]).shape
    model = CNN_Text(input_dims, [64, 32, 32, 32], [1, 2, 3, 4], num_classes,
                     0.5, num_layers, num_views).to(device)
    model = model.double()
    model.device = device
    model.learning_rate = learning_rate
    model.epoch = 0
    if args.model is not None:
        model.load_state_dict(torch.load(args.model))
        print('Successfully load pre-trained model!')
    # train the model for num_epochs epochs
    train_model(model, train_loader, num_epochs)
    print('Finish training process!')
    evaluation(model)
Code Example #3
    def __init__(self, config, n_gpu, vocab, train_loader=None, val_loader=None):
        self.config = config
        self.vocab = vocab
        self.n_gpu = n_gpu
        self.train_loader = train_loader
        self.val_loader = val_loader

        # Build model
        vocab_size = self.vocab.vocab_size()

        self.model = CNN_Text(self.config, vocab_size, self.config.n_label)
        self.model.to(device)

        if self.n_gpu > 1:
            self.model = nn.DataParallel(self.model)

        # Build optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=0.0005)

        # Build criterion
        self.criterion = nn.CrossEntropyLoss()
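
A minimal sketch of one optimization step using the trainer attributes initialized above (self.model, self.optimizer, self.criterion); the batch layout (inputs, labels) and the device variable are assumptions, not part of the original class.

    def train_step(self, batch):
        # hypothetical helper: one forward/backward pass over a single batch
        inputs, labels = batch                                  # assumed batch layout
        inputs, labels = inputs.to(device), labels.to(device)
        self.optimizer.zero_grad()
        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss.backward()
        self.optimizer.step()
        return loss.item()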
Code Example #4
File: main.py  Project: catwang01/NLP-Project
def main_train():
    def clean_str(string):
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    TEXT.preprocessing = data.Pipeline(clean_str)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

    trainset, valset = MR.splits(data_path, fields=[("text", TEXT), ("label", LABEL)])
    TEXT.build_vocab(trainset)

    with open("text.field", 'wb') as f:
        dill.dump(TEXT, f)

    trainiter = data.BucketIterator(trainset, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                    shuffle=True, device=device)

    valiter = data.BucketIterator(valset, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                  shuffle=True, device=device)

    model = CNN_Text(channel_dim, len(TEXT.vocab), embed_dim, output_dim, kernel_sizes, is_static=False,
                     dropout_rate=dropout_rate)
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    train_model(epochs, model, trainiter, valiter, optimizer, criterion)
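
A short sketch of how the TEXT field serialized with dill above could be reloaded for inference, assuming the same legacy torchtext Field API used in this example; the sample sentence and the call sequence are illustrative, not from the original project.

# hypothetical inference-time reuse of the saved field
with open("text.field", "rb") as f:
    TEXT = dill.load(f)

model.eval()
tokens = TEXT.preprocess("this movie was surprisingly good")  # tokenize + clean_str
batch = TEXT.process([tokens]).to(device)                     # numericalize and pad: [1, seq_len]
with torch.no_grad():
    predicted_class = model(batch).argmax(dim=1)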
Code Example #5
    tokens.append('oov')
    tokens.append('bos')
    # map each token to an integer id
    word2id = {word: idx for idx, word in enumerate(tokens)}
    args.embed_num = len(tokens)
    args.class_num = 2
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    #print("\nParameters:")
    #for attr, value in sorted(args.__dict__.items()):
    #	print("\t{}={}".format(attr.upper(), value))

    model = CNN_Text(args)

    if torch.cuda.is_available():
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    report_interval = 5000
    for epoch in range(1, args.epochs + 1):
        train_batch_i = 0
        batch_counter = 0
        accumulated_loss = 0
        train_sents_scaned = 0
        train_num_correct = 0
        model.train()
        print('--' * 20)
        start_time = time.time()
Code Example #6
    #use CUDA to speed up
    use_cuda = torch.cuda.is_available()

    #get data
    train_loader = Data.DataLoader(dataset=CustomDataset(path="train.json",
                                                         balance=False),
                                   batch_size=BATCH_SIZE,
                                   shuffle=True)
    test_loader = Data.DataLoader(dataset=CustomDataset(path="test.json",
                                                        balance=False),
                                  batch_size=BATCH_SIZE,
                                  shuffle=True)

    #initialize model
    cnn = CNN_Text()
    if use_cuda:
        cnn = cnn.cuda()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR, weight_decay=0.0005)

    #train
    for epoch in range(EPOCH):
        print("epoch :")
        if epoch % 5 == 0:
            test(cnn, test_loader, use_cuda)
        for step, data in enumerate(train_loader):
            vec, lens, label = data
            #print(vec.shape)
            if use_cuda:
                vec = vec.cuda()
                label = label.cuda()
Code Example #7
File: main.py  Project: forheroes1994/ez-classifier
    def train(self, m_2, m_3, m_4):
        word_dict, label_dict = self.divide_two_dict(m_2)
        if self.hyperparameter_1.word_embedding:
            path = "word2vec/glove.6B.100d.txt"
            print("loading word2vec ")
            word_vecs = self.load_my_vector(path, word_dict.m_list)
            print("new words already in word2vec:" + str(len(word_vecs)))
            print("loading unknow word2vec and convert to list... ")
            word_vecs = self.add_unknow_words_by_average(
                word_vecs, word_dict.m_list, k=self.hyperparameter_1.embed_dim)
            print("unknown word2vec load ! and converted to list...")
            # if self.hyperparameter_1.word_embedding:
            self.hyperparameter_1.pretrained_weight = word_vecs
            # pretrained_weight = np.array(self.hyperparameter_1.pretrained_weight)
            # self.embed.weight.data.copy_(torch.from_numpy(pretrained_weight))

        # self.nn = network(2, 2, 2, hidden_layer_weights=None, hidden_layer_bias=None, output_layer_weights=None, output_layer_bias=None)
        train_example = self.out_example_index(m_2, m_2)
        dev_example = self.out_example_index(m_2, m_3)
        test_example = self.out_example_index(m_2, m_4)

        random.shuffle(train_example)
        random.shuffle(dev_example)
        random.shuffle(test_example)

        self.model = CNN_Text(self.hyperparameter_1)
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.hyperparameter_1.lr)
        train_example_idx = self.set_index(train_example)
        random.shuffle(train_example_idx)
        steps = 0
        self.model.train()
        for epoch in range(1, self.hyperparameter_1.epochs + 1):
            batchBlock = self.set_batchBlock(train_example)
            for every_batchBlock in range(batchBlock):
                exams = []
                start_pos = every_batchBlock * self.hyperparameter_1.batch_size
                end_pos = (every_batchBlock +
                           1) * self.hyperparameter_1.batch_size
                if end_pos > len(train_example):
                    end_pos = len(train_example)
                for idx in range(start_pos, end_pos):
                    exams.append(train_example[train_example_idx[idx]])
                max_len = self.get_max_sentence_len(exams)
                optimizer.zero_grad()
                feat, label = self.batch(exams,
                                         self.hyperparameter_1.batch_size,
                                         max_len)
                label = label.view(len(exams))
                logit = self.model.forward(feat)
                loss = F.cross_entropy(logit, label)
                loss.backward()
                optimizer.step()
                steps += 1
                if steps % self.hyperparameter_1.log_interval == 0:
                    train_size = len(train_example)
                    corrects = (torch.max(logit, 1)[1].view(
                        label.size()) == label).sum().item()
                    accuracy = corrects / self.hyperparameter_1.batch_size * 100.0
                    sys.stdout.write(
                        '\rBatch[{}/{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.
                        format(steps, train_size, loss.item(), accuracy,
                               corrects, self.hyperparameter_1.batch_size))
                if steps % self.hyperparameter_1.test_interval == 0:
                    self.eval(dev_example, self.model)
                if steps % self.hyperparameter_1.save_interval == 0:
                    if not os.path.isdir(self.hyperparameter_1.save_dir):
                        os.makedirs(self.hyperparameter_1.save_dir)
                    save_prefix = os.path.join(self.hyperparameter_1.save_dir,
                                               'snapshot')
                    save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                    torch.save(self.model, save_path)