Example #1
# Assumes the NCRF++ context: torch, Data, data_initialization, and train are already imported/defined.
def easyTrain(confdict):
    print('Model Train')
    data = Data()
    data.read_config(confdict)
    data.HP_gpu = torch.cuda.is_available()
    data_initialization(data)
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.build_pretrain_emb()
    f1 = train(data)
    return f1
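A minimal usage sketch for easyTrain (this variant's Data.read_config takes a dict; the keys and paths below are illustrative placeholders, not the exact NCRF++ config schema):

# Hypothetical call; keys and paths are placeholders.
conf = {
    'train_dir': 'data/conll03/train.bmes',
    'dev_dir': 'data/conll03/dev.bmes',
    'test_dir': 'data/conll03/test.bmes',
    'model_dir': 'data/model/saved_model.lstmcrf.',
}
best_f1 = easyTrain(conf)
print('best dev f1:', best_f1)
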
Example #2
def dispatch(config=None, status="train", data=None):
    if data is None:
        data = Data()
        data.HP_gpu = torch.cuda.is_available()
        data.read_config(config)
    else:
        data.HP_gpu = torch.cuda.is_available()

    data.show_data_summary()
    status = data.status.lower()  # note: overrides the status argument with the config value
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        return train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
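dispatch can also reuse an already-populated Data object, in which case it only refreshes the GPU flag; a minimal sketch, assuming a config file (placeholder name) and NCRF++'s default status of 'train':

# Hypothetical reuse of a pre-built Data object.
prepared = Data()
prepared.read_config('demo.train.config')  # placeholder path
f1 = dispatch(data=prepared)
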
Example #3
    args = parser.parse_args()
    data = Data()
    data.HP_gpu = torch.cuda.is_available()
    data.read_config(args.config)
    data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(args.config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
Example #4
def main():
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--config', help='Configuration File', default='None')
    parser.add_argument('--wordemb',
                        help='Embedding for words',
                        default='None')
    parser.add_argument('--charemb',
                        help='Embedding for chars',
                        default='None')
    parser.add_argument('--status',
                        choices=['train', 'decode'],
                        help='update algorithm',
                        default='train')
    parser.add_argument('--savemodel',
                        default="data/model/saved_model.lstmcrf.")
    parser.add_argument('--savedset', help='Dir of saved data setting')
    parser.add_argument('--train', default="data/conll03/train.bmes")
    parser.add_argument('--dev', default="data/conll03/dev.bmes")
    parser.add_argument('--test', default="data/conll03/test.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--random-seed', type=int, default=42)
    parser.add_argument('--lr', type=float)
    parser.add_argument('--batch-size', type=int)
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    parser.add_argument('--output-tsv')
    parser.add_argument('--model-prefix')
    parser.add_argument('--cpu', action='store_true')

    args = parser.parse_args()

    # Set random seed
    seed_num = args.random_seed
    random.seed(seed_num)
    torch.manual_seed(seed_num)
    np.random.seed(seed_num)

    data = Data()
    data.random_seed = seed_num
    data.HP_gpu = torch.cuda.is_available()
    if args.config == 'None':
        data.train_dir = args.train
        data.dev_dir = args.dev
        data.test_dir = args.test
        data.model_dir = args.savemodel
        data.dset_dir = args.savedset
        print("Save dset directory:", data.dset_dir)
        save_model_dir = args.savemodel
        data.word_emb_dir = args.wordemb
        data.char_emb_dir = args.charemb
        if args.seg.lower() == 'true':
            data.seg = True
        else:
            data.seg = False
        print("Seed num:", seed_num)
    else:
        data.read_config(args.config)
    if args.lr:
        data.HP_lr = args.lr
    if args.batch_size:
        data.HP_batch_size = args.batch_size
    data.output_tsv_path = args.output_tsv
    if args.cpu:
        data.HP_gpu = False
    if args.model_prefix:
        data.model_dir = args.model_prefix

    # data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(args.config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
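With this main(), the script is driven from the command line; typical invocations might look as follows (all file names are placeholders, and with --config given the status and paths come from the config file):

python main.py --config demo.train.config
python main.py --train data/conll03/train.bmes --dev data/conll03/dev.bmes --test data/conll03/test.bmes --savedset data/model/saved.dset
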
Example #5
        train_data.read_config(args.train)
        train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
        dict_encoded, all_sent, _ = train_enc.encode(train_data.dev_gold)
        processing.write_to_conllu(dict_encoded, train_data.dev_enc_dep2label,
                                   0)
        train_data.HP_gpu = torch.cuda.is_available()
        print("Seed num:", seed_num)
        train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
        dict_encoded, all_sent, _ = train_enc.encode(train_data.train_gold)
        processing.write_to_conllu(dict_encoded,
                                   train_data.train_enc_dep2label, 0)
        data_initialization(train_data)
        train_data.generate_instance('train')
        train_data.generate_instance('dev')
        #train_data.generate_instance('test')
        train_data.build_pretrain_emb()
        train(train_data, decode, args)

    else:
        #DECODE
        decode.HP_gpu = torch.cuda.is_available()
        decode.load(decode.dset_dir)
        #decode_data.show_data_summary()
        decode.read_config(args.decode)
        if decode.eval_type == "CONLLU":
            lookup = processing.dump_into_lookup(decode.input)
        dev_enc = l.Encoding(0, decode.postag_type)
        #pass NCRF a file with labels column as 0 for sanity check
        dict_encoded, all_sent, all_text = dev_enc.encode(decode.input)
        processing.write_to_conllu(dict_encoded, decode.raw_dir, 0)
        decode.generate_instance('raw')
Example #6
    status = cross_data.status.lower()

    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")

        transfer_flag = False
        if cross_data.mode == 'supervised':
            data_init_supervised(cross_data)
        elif cross_data.mode == 'transfer':
            data_init_transfer(cross_data)
            transfer_flag = True

        cross_data.generate_instance('train', transfer_flag)
        cross_data.generate_instance('dev', transfer_flag)
        cross_data.generate_instance('test', transfer_flag)
        cross_data.build_pretrain_emb()
        train(cross_data)

    elif status == 'decode':
        print("MODEL: decode")
        cross_data.load(cross_data.init_dir)
        cross_data.read_config(args.config)
        cross_data.show_data_summary()
        load_model_decode(cross_data)
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
Example #7
class NCRF:
    def __init__(self):
        # print("Python Version: %s.%s"%(sys.version_info[0],sys.version_info[1]))
        # print("PyTorch Version:%s"%(torch.__version__))
        # print("Process ID: ", os.getpid())
        self.data = Data()
        self.data.HP_gpu = torch.cuda.is_available()
        if self.data.HP_gpu:
            self.data.device = 'cuda'
        # print("GPU:", self.data.HP_gpu, "; device:", self.data.device)
        self.optimizer = None
        self.model = None

    def read_data_config_file(self, config_dir):
        self.data.read_config(config_dir)

    def manual_data_setting(self, setting_dict):
        ## set data through a manual dict; all values should be strings
        self.data.manual_config(setting_dict)

    def initialize_model_and_optimizer(self):
        if self.data.sentence_classification:
            self.model = SentClassifier(self.data)
        else:
            self.model = SeqLabel(self.data)
        if self.data.optimizer.lower() == "sgd":
            self.optimizer = optim.SGD(self.model.parameters(),
                                       lr=self.data.HP_lr,
                                       momentum=self.data.HP_momentum,
                                       weight_decay=self.data.HP_l2)
        elif self.data.optimizer.lower() == "adagrad":
            self.optimizer = optim.Adagrad(self.model.parameters(),
                                           lr=self.data.HP_lr,
                                           weight_decay=self.data.HP_l2)
        elif self.data.optimizer.lower() == "adadelta":
            self.optimizer = optim.Adadelta(self.model.parameters(),
                                            lr=self.data.HP_lr,
                                            weight_decay=self.data.HP_l2)
        elif self.data.optimizer.lower() == "rmsprop":
            self.optimizer = optim.RMSprop(self.model.parameters(),
                                           lr=self.data.HP_lr,
                                           weight_decay=self.data.HP_l2)
        elif self.data.optimizer.lower() == "adam":
            self.optimizer = optim.Adam(self.model.parameters(),
                                        lr=self.data.HP_lr,
                                        weight_decay=self.data.HP_l2)
        else:
            print("Optimizer illegal: %s" % (self.data.optimizer))
            exit(1)

    def initialize_data(self, input_list=None):
        self.data.initial_alphabets(input_list)
        if self.data.use_word_emb and self.data.use_word_seq:
            self.data.build_pretrain_emb()

    def initialization(self, input_list=None):
        '''
        input_list: [train_list, dev_list, test_list]
              train_list/dev_list/test_list: [sent_list, label_list, feature_list]
                      sent_list: list of list [[word1, word2,...],...,[wordx, wordy,...]]
                      label_list:     if sentence_classification:
                                           list of labels [label1, label2,...labelx, labely,...]
                                      else:
                                           list of list [[label1, label2,...],...,[labelx, labely,...]]
                      feature_list:   if sentence_classification:
                                           list of feature lists [[feat1, feat2,..],...,[feat1, feat2,..]], len(feature_list) = sentence_num
                                      else:
                                           list of list [[[feat1, feat2,..],...,[feat1, feat2,..]],...], len(feature_list) = sentence_num
        '''
        ## data must be initialized before the model and optimizer, as the
        ## alphabet sizes and pretrained embeddings determine the model shape
        self.initialize_data(input_list)
        self.initialize_model_and_optimizer()

    def self_generate_instances(self):
        self.data.generate_instance('train')
        self.data.generate_instance('dev')
        self.data.generate_instance('test')

    def generate_instances_from_list(self, input_list, name):
        return self.data.generate_instance_from_list(input_list, name)

    def save(self, model_dir="ncrf.model"):
        # print("Save model to file: ", model_dir)
        the_dict = {
            'data': self.data,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        torch.save(the_dict, model_dir)

    def load(self, model_dir="ncrf.model"):
        the_dict = torch.load(model_dir)
        self.data = the_dict['data']
        self.data.silence = True
        ## initialize the model and optimizer before loading the state dicts
        self.initialize_model_and_optimizer()
        self.model.load_state_dict(the_dict['state_dict'])
        self.optimizer.load_state_dict(the_dict['optimizer'])
        # print("Model loaded from file: ", model_dir)

    def train(self, train_Ids=None, save_model_dir=None):
        '''
        train_Ids: list of words, chars and labels, various length. [[words, features, chars, labels],[words, features, chars,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            features: features ids for one sentence. (batch_size, sent_len, feature_num)
            chars: char ids for one sentence, various length. (batch_size, sent_len, each_word_length)
            labels: label ids for one sentence. (batch_size, sent_len)
        save_model_dir: model name to be saved
        '''
        if train_Ids:
            self.data.train_Ids = train_Ids
        # print(self.data.train_Ids[0])
        print('-----begin train------')
        # exit(0)
        best_dev = -10
        best_model = None
        for idx in range(self.data.HP_iteration):
            epoch_start = time.time()
            temp_start = epoch_start
            # print("Epoch: %s/%s" %(idx,self.data.HP_iteration))
            if self.data.optimizer == "SGD":
                self.optimizer = lr_decay(self.optimizer, idx,
                                          self.data.HP_lr_decay,
                                          self.data.HP_lr)
            instance_count = 0
            sample_id = 0
            sample_loss = 0
            total_loss = 0
            right_token = 0
            whole_token = 0
            random.shuffle(self.data.train_Ids)
            first_list = ", ".join([
                self.data.word_alphabet.get_instance(a)
                for a in self.data.train_Ids[0][0]
            ])
            # print("Shuffle: first input: [%s]" %(first_list))
            ## set the model to train mode
            self.model.train()

            batch_size = self.data.HP_batch_size
            batch_id = 0
            train_num = len(self.data.train_Ids)
            total_batch = train_num // batch_size + 1
            for batch_id in range(total_batch):
                self.optimizer.zero_grad()
                start = batch_id * batch_size
                end = (batch_id + 1) * batch_size
                if end > train_num:
                    end = train_num
                instance = self.data.train_Ids[start:end]
                if not instance:
                    continue
                batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_word_text, batch_label, mask = batchify_with_label(
                    instance, self.data.HP_gpu, True,
                    self.data.sentence_classification)
                instance_count += 1
                loss, tag_seq = self.model.calculate_loss(
                    batch_word, batch_features, batch_wordlen, batch_char,
                    batch_charlen, batch_charrecover, batch_word_text,
                    batch_label, mask)
                right, whole = predict_check(tag_seq, batch_label, mask,
                                             self.data.sentence_classification)
                right_token += right
                whole_token += whole
                # print("loss:",loss.item())
                sample_loss += loss.item()
                total_loss += loss.item()
                if end % 300000 == 0:
                    temp_time = time.time()
                    temp_cost = temp_time - temp_start
                    temp_start = temp_time
                    print(
                        "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                        %
                        (end, temp_cost, sample_loss, right_token, whole_token,
                         (right_token + 0.) / whole_token))
                    if sample_loss > 1e8 or str(sample_loss) == "nan":
                        print(
                            "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                        )
                        exit(1)
                    sys.stdout.flush()
                    sample_loss = 0
                loss.backward()
                self.optimizer.step()
            temp_time = time.time()
            temp_cost = temp_time - temp_start
            # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))

            epoch_finish = time.time()
            epoch_cost = epoch_finish - epoch_start
            print(
                "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
                % (idx, epoch_cost, train_num / epoch_cost, total_loss))
            # print("totalloss:", total_loss)
            if total_loss > 1e8 or str(total_loss) == "nan":
                print(
                    "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                )
                exit(1)
            # continue
            speed, f = evaluate(self.data, self.model, "dev")
            dev_finish = time.time()
            dev_cost = dev_finish - epoch_finish
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s;  f: %.4f" %
                  (dev_cost, speed, f))
            if current_score > best_dev:
                # if self.data.seg:
                print("Exceed previous best f score:", best_dev)
                _, f = evaluate(self.data, self.model, "test")
                print("Test:  f: %.4f" % (f))
                # _ ,  f = evaluate(self.data, self.model, "test")
                # if self.data.seg:
                # print("Test:  f: %.4f"%(f))
                # else:
                #     print("Exceed previous best f score:", best_dev)
                # if save_model_dir == None:
                #     model_name = self.data.model_dir + ".model"
                # else:
                #     model_name = save_model_dir  + ".model"
                # self.save(model_name)
                # torch.save(model.state_dict(), model_name)
                best_dev = current_score
                # best_model = model_name
            ## decode test
            # else:
            #     print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
            gc.collect()
        # if best_model != None:
        #     self.load(best_model)
        #     _ ,  f = evaluate(self.data, self.model, "test")
        #     print("Test:  f: %.4f"%(f))

    # def evaluate(self):

    def decode(self, raw_Ids):
        '''
        raw_Ids: list of words, chars and labels, various length. [[words, features, chars, labels],[words, features, chars,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            features: features ids for one sentence. (batch_size, sent_len, feature_num)
            chars: char ids for one sentence, various length. (batch_size, sent_len, each_word_length)
            labels: label ids for one sentence. (batch_size, sent_len)
            ## label should be padded in raw input
        '''
        instances = raw_Ids
        ## set the model to eval mode
        self.model.eval()
        batch_size = self.data.HP_batch_size
        instance_num = len(instances)
        total_batch = instance_num // batch_size + 1
        decode_label = []
        for batch_id in tqdm(range(total_batch)):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > instance_num:
                end = instance_num
            instance = instances[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_word_text, batch_label, mask = batchify_with_label(
                instance, self.data.HP_gpu, False,
                self.data.sentence_classification)
            tag_seq = self.model(batch_word, batch_features, batch_wordlen,
                                 batch_char, batch_charlen, batch_charrecover,
                                 batch_word_text, None, mask)
            tag_seq = tag_seq[batch_wordrecover.cpu()]
            decode_label += tag_seq.cpu().data.numpy().tolist()
        return decode_label

    def decode_prob(self, raw_Ids):
        '''
        raw_Ids: list of words, chars and labels, various length. [[words, features, chars, labels],[words, features, chars,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            features: features ids for one sentence. (batch_size, sent_len, feature_num)
            chars: char ids for one sentence, various length. (batch_size, sent_len, each_word_length)
            labels: label ids for one sentence. (batch_size, sent_len)
            ## label should be padded in raw input
        '''
        if not self.data.sentence_classification:
            print(
                "decode probability is only valid in sentence classification task. Exit."
            )
            exit(0)
        instances = raw_Ids
        target_probability_list = []
        target_result_list = []
        ## set the model to eval mode
        self.model.eval()
        batch_size = self.data.HP_batch_size
        instance_num = len(instances)
        total_batch = instance_num // batch_size + 1
        for batch_id in tqdm(range(total_batch)):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > instance_num:
                end = instance_num
            instance = instances[start:end]
            if start % 10000 == 0:
                print("Decode: ", start)
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_word_text, batch_label, mask = batchify_with_label(
                instance, self.data.HP_gpu, False,
                self.data.sentence_classification)
            target_probability, _ = self.model.get_target_probability(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_word_text, None, mask)
            target_probability = target_probability[batch_wordrecover.cpu()]
            target_probability_list.append(target_probability)
        target_probabilities = np.concatenate(target_probability_list, axis=0)
        return target_probabilities

    def decode_prob_and_attention_weights(self, raw_Ids):
        '''
        raw_Ids: list of words, chars and labels, various length. [[words, features, chars, labels],[words, features, chars,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            features: features ids for one sentence. (batch_size, sent_len, feature_num)
            chars: char ids for one sentence, various length. (batch_size, sent_len, each_word_length)
            labels: label ids for one sentence. (batch_size, sent_len)
            ## label should be padded in raw input
        '''
        if not self.data.sentence_classification:
            print(
                "decode probability is only valid in sentence classification task. Exit."
            )
            exit(0)
        if self.data.words2sent_representation.upper() not in ("ATTENTION", "ATT"):
            print(
                "attention weights are only valid in attention model. Current: %s,  Exit."
                % (self.data.words2sent_representation))
            exit(0)
        instances = raw_Ids
        target_probability_list = []
        sequence_attention_weight_list = []
        ## set the model to eval mode
        self.model.eval()
        batch_size = self.data.HP_batch_size
        instance_num = len(instances)
        total_batch = instance_num // batch_size + 1
        for batch_id in tqdm(range(total_batch)):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > instance_num:
                end = instance_num
            instance = instances[start:end]
            if start % 10000 == 0:
                print("Decode: ", start)
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_word_text, batch_label, mask = batchify_with_label(
                instance, self.data.HP_gpu, False,
                self.data.sentence_classification)
            target_probability, weights = self.model.get_target_probability(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_word_text, None, mask)
            ## target_probability, weights are both numpy
            target_probability = target_probability[batch_wordrecover.cpu()]
            weights = weights[batch_wordrecover.cpu()]
            target_probability_list.append(target_probability)
            sequence_attention_weight_list += weights.tolist()
        target_probabilities = np.concatenate(target_probability_list, axis=0)
        print(len(sequence_attention_weight_list))
        ## sequence_attention_weight_list: list with different batch size and many padded 0
        return target_probabilities, sequence_attention_weight_list
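A minimal end-to-end sketch for the NCRF wrapper above, assuming a config file supplies the train/dev/test paths (the config path, model file name, and toy labels are placeholders):

# Hypothetical pipeline using only the methods defined above.
ncrf = NCRF()
ncrf.read_data_config_file('demo.train.config')  # placeholder path
ncrf.initialization()             # alphabets + embeddings first, then model + optimizer
ncrf.self_generate_instances()    # builds train/dev/test Ids from the config paths
ncrf.train()
ncrf.save('ncrf.model')

# The input_list format documented in initialization(), as a toy example:
sents  = [['EU', 'rejects', 'German', 'call']]
labels = [['B-ORG', 'O', 'B-MISC', 'O']]
feats  = [[[], [], [], []]]       # one (empty) feature list per word
train_list = [sents, labels, feats]
input_list = [train_list, train_list, train_list]  # reuse the toy split for dev/test

# Reload later and decode pre-batched raw Ids (assumed prepared via generate_instance('raw')):
ncrf2 = NCRF()
ncrf2.load('ncrf.model')
pred_ids = ncrf2.decode(ncrf2.data.raw_Ids)
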
Example #8
data.HP_gpu = torch.cuda.is_available()
print("Seed num:", seed_num)
data.number_normalized = True
data.word_emb_dir = "../data/glove.6B.100d.txt"

if status == 'train':
    print("MODEL: train")
    data_initialization(data)
    data.use_char = True
    data.HP_batch_size = 10
    data.HP_lr = 0.015
    data.char_seq_feature = "CNN"
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.build_pretrain_emb()
    train(data)
elif status == 'decode':
    print("MODEL: decode")
    data.load(data.dset_dir)
    data.raw_dir = args.raw
    data.decode_dir = args.output
    data.load_model_dir = args.loadmodel
    data.show_data_summary()
    data.generate_instance('raw')
    print("nbest: %s" % (data.nbest))
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.nbest:
        data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
    else:
        data.write_decoded_results(decode_results, 'raw')