Example 1
 def generate_instance(self, name):
     self.fix_alphabet()
     if name == "train":
         self.train_texts, self.train_Ids = read_instance(
             self.train_dir, self.word_alphabet, self.char_alphabet,
             self.feature_alphabets, self.label_alphabet,
             self.number_normalized, self.MAX_SENTENCE_LENGTH)
     elif name == "dev":
         self.dev_texts, self.dev_Ids = read_instance(
             self.dev_dir, self.word_alphabet, self.char_alphabet,
             self.feature_alphabets, self.label_alphabet,
             self.number_normalized, self.MAX_SENTENCE_LENGTH)
     elif name == "test":
         self.test_texts, self.test_Ids = read_instance(
             self.test_dir, self.word_alphabet, self.char_alphabet,
             self.feature_alphabets, self.label_alphabet,
             self.number_normalized, self.MAX_SENTENCE_LENGTH)
     elif name == "raw":
         self.raw_texts, self.raw_Ids = read_instance(
             self.raw_dir, self.word_alphabet, self.char_alphabet,
             self.feature_alphabets, self.label_alphabet,
             self.number_normalized, self.MAX_SENTENCE_LENGTH)
     else:
         print(
             "Error: you can only generate train/dev/test/raw instance! Illegal input: %s"
             % (name))
Example 2
def evaluate(data, model, name, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
            instance, data.HP_gpu, True)
        if nbest:
            scores, nbest_tag_seq = model.decode_nbest(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, nbest)
            nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask,
                                                    data.label_alphabet,
                                                    batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist(
            )
            ## select the best sequence to evaluate
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen,
                            batch_char, batch_charlen, batch_charrecover, mask)
        # print("tag:",tag_seq)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask,
                                               data.label_alphabet,
                                               batch_wordrecover)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    if nbest:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores
Example 3
def IOB2BIO(input_file, output_file):
    print("Convert IOB -> BIO for file: %s", input_file)
    with open(input_file,'r') as in_file:
        fins = in_file.readlines()
    fout = open(output_file,'w')
    words = []
    labels = []
    for line in fins:
        if len(line) < 3:
            sent_len = len(words)
            for idx in range(sent_len):
                if "I-" in labels[idx]:
                    label_type = labels[idx].split('-')[-1]
                    if (idx == 0) or (labels[idx-1] == "O") or (label_type != labels[idx-1].split('-')[-1]):
                        fout.write(words[idx]+" B-"+label_type+"\n")
                    else:
                        fout.write(words[idx]+" "+labels[idx]+"\n")
                else:
                    fout.write(words[idx]+" "+labels[idx]+"\n")
            fout.write('\n')
            words = []
            labels = []
        else:
            pair = line.strip('\n').split()
            words.append(pair[0])
            labels.append(pair[-1].upper())
    fout.close()
    print("BIO file generated: %s", output_file)
Example 4
def recover_nbest_label(pred_variable, mask_variable, label_alphabet,
                        word_recover):
    """
        input:
            pred_variable (batch_size, sent_len, nbest): pred tag result
            mask_variable (batch_size, sent_len): mask variable
            word_recover (batch_size)
        output:
            nbest_pred_label list: [batch_size, nbest, each_seq_len]
    """
    # print("word recover:", word_recover.size())
    # exit(0)
    pred_variable = pred_variable[word_recover]
    mask_variable = mask_variable[word_recover]
    batch_size = pred_variable.size(0)
    seq_len = pred_variable.size(1)
    # print(pred_variable.size())
    nbest = pred_variable.size(2)
    mask = mask_variable.cpu().data.numpy()
    pred_tag = pred_variable.cpu().data.numpy()
    batch_size = mask.shape[0]
    pred_label = []
    for idx in range(batch_size):
        pred = []
        for idz in range(nbest):
            each_pred = [
                label_alphabet.get_instance(pred_tag[idx][idy][idz])
                for idy in range(seq_len) if mask[idx][idy] != 0
            ]
            pred.append(each_pred)
        pred_label.append(pred)
    return pred_label
Example 5
 def write_decoded_results(self, predict_results, name):
     fout = open(self.decode_dir, 'w')
     sent_num = len(predict_results)
     content_list = []
     if name == 'raw':
         content_list = self.raw_texts
     elif name == 'test':
         content_list = self.test_texts
     elif name == 'dev':
         content_list = self.dev_texts
     elif name == 'train':
         content_list = self.train_texts
     else:
         print(
             "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
         )
     assert (sent_num == len(content_list))
     for idx in range(sent_num):
         sent_length = len(predict_results[idx])
         for idy in range(sent_length):
             ## content_list[idx] is a list with [word, char, label]
             fout.write(content_list[idx][0][idy] + " " +
                        predict_results[idx][idy] + '\n')
         fout.write('\n')
     fout.close()
     print("Predict %s result has been written into file. %s" %
           (name, self.decode_dir))
Example 6
 def __init__(self,
              alphabet_size,
              pretrain_char_embedding,
              embedding_dim,
              hidden_dim,
              dropout,
              gpu,
              bidirect_flag=True):
     super(CharBiGRU, self).__init__()
     print("build char sequence feature extractor: GRU ...")
     self.gpu = gpu
     self.hidden_dim = hidden_dim
     if bidirect_flag:
         self.hidden_dim = hidden_dim // 2
     self.char_drop = nn.Dropout(dropout)
     self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
     if pretrain_char_embedding is not None:
         self.char_embeddings.weight.data.copy_(
             torch.from_numpy(pretrain_char_embedding))
     else:
         self.char_embeddings.weight.data.copy_(
             torch.from_numpy(
                 self.random_embedding(alphabet_size, embedding_dim)))
     self.char_lstm = nn.GRU(embedding_dim,
                             self.hidden_dim,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=bidirect_flag)
     if self.gpu:
         self.char_drop = self.char_drop.cuda()
         self.char_embeddings = self.char_embeddings.cuda()
         self.char_lstm = self.char_lstm.cuda()
Example 7
 def get_instance(self, index):
     if index == 0:
         if self.label:
             return self.instances[0]
         # First index is occupied by the wildcard element.
         return None
     try:
         return self.instances[index - 1]
     except IndexError:
         print(
             'WARNING: Alphabet get_instance, unknown instance, return the first label.'
         )
         return self.instances[0]
Example 8
 def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths,
                  char_inputs, char_seq_lengths, char_seq_recover, mask,
                  nbest):
     if not self.use_crf:
         print("Nbest output is currently supported only for CRF! Exit...")
         exit(0)
     outs = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths,
                             char_inputs, char_seq_lengths,
                             char_seq_recover)
     batch_size = word_inputs.size(0)
     seq_len = word_inputs.size(1)
     scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
     return scores, tag_seq
Example 9
def load_model_decode(data, name):
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    ## load model need consider if the model trained in GPU and load in CPU, or vice versa
    # if not gpu:
    #     model.load_state_dict(torch.load(model_dir))
    #     # model.load_state_dict(torch.load(model_dir), map_location=lambda storage, loc: storage)
    #     # model = torch.load(model_dir, map_location=lambda storage, loc: storage)
    # else:
    #     model.load_state_dict(torch.load(model_dir))
    #     # model = torch.load(model_dir)
    model.load_state_dict(torch.load(data.load_model_dir))

    print("Decode %s data, nbest: %s ..." % (name, data.nbest))
    start_time = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(
        data, model, name, data.nbest)
    end_time = time.time()
    time_cost = end_time - start_time
    if data.seg:
        print(
            "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
            % (name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" %
              (name, time_cost, speed, acc))
    return pred_results, pred_scores
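On the commented-out GPU/CPU concern above: a minimal sketch (standard torch.load map_location argument; not part of the function itself) of loading a GPU-trained checkpoint on a CPU-only machine.

# Sketch: load a checkpoint that was saved on GPU onto a CPU-only machine.
import torch

model = SeqModel(data)  # as in load_model_decode above
state_dict = torch.load(data.load_model_dir,
                        map_location=lambda storage, loc: storage)  # or map_location='cpu'
model.load_state_dict(state_dict)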
Example 10
 def save(self, output_directory, name=None):
     """
     Save both alphabet records to the given directory.
     :param output_directory: Directory to save model and weights.
     :param name: The alphabet saving name, optional.
     :return:
     """
     saving_name = name if name else self.__name
     try:
         json.dump(
             self.get_content(),
             open(os.path.join(output_directory, saving_name + ".json"),
                  'w'))
     except Exception as e:
         print("Exception: Alphabet is not saved: " % repr(e))
Example 11
File: crf.py Project: qiuwei/NCRFpp
 def __init__(self, tagset_size, gpu):
     super(CRF, self).__init__()
     print("build CRF...")
     self.gpu = gpu
     # Matrix of transition parameters. Entry (i, j) is the score of transitioning *from* tag i *to* tag j.
     self.tagset_size = tagset_size
     # We add 2 here, because of START_TAG and STOP_TAG
     # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag
     init_transitions = torch.zeros(self.tagset_size + 2,
                                    self.tagset_size + 2)
     init_transitions[:, START_TAG] = -10000.0
     init_transitions[STOP_TAG, :] = -10000.0
     init_transitions[:, 0] = -10000.0
     init_transitions[0, :] = -10000.0
     if self.gpu:
         init_transitions = init_transitions.cuda()
     self.transitions = nn.Parameter(init_transitions)
Example 12
def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"):
    sent_num = len(golden_lists)
    golden_full = []
    predict_full = []
    right_full = []
    right_tag = 0
    all_tag = 0
    for idx in range(0,sent_num):
        # word_list = sentence_lists[idx]
        golden_list = golden_lists[idx]
        predict_list = predict_lists[idx]
        for idy in range(len(golden_list)):
            if golden_list[idy] == predict_list[idy]:
                right_tag += 1
        all_tag += len(golden_list)
        if label_type == "BMES":
            gold_matrix = get_ner_BMES(golden_list)
            pred_matrix = get_ner_BMES(predict_list)
        else:
            gold_matrix = get_ner_BIO(golden_list)
            pred_matrix = get_ner_BIO(predict_list)
        # print "gold", gold_matrix
        # print "pred", pred_matrix
        right_ner = list(set(gold_matrix).intersection(set(pred_matrix)))
        golden_full += gold_matrix
        predict_full += pred_matrix
        right_full += right_ner
    right_num = len(right_full)
    golden_num = len(golden_full)
    predict_num = len(predict_full)
    if predict_num == 0:
        precision = -1
    else:
        precision =  (right_num+0.0)/predict_num
    if golden_num == 0:
        recall = -1
    else:
        recall = (right_num+0.0)/golden_num
    if (precision == -1) or (recall == -1) or (precision+recall) <= 0.:
        f_measure = -1
    else:
        f_measure = 2*precision*recall/(precision+recall)
    accuracy = (right_tag+0.0)/all_tag
    # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy
    print(f"gold_num = {golden_num}, pred_num = {predict_num}, right_num = {right_num}")
    return accuracy, precision, recall, f_measure
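To make the arithmetic concrete, hypothetical counts plugged into the precision/recall/F formulas above:

# Hypothetical counts only, illustrating the arithmetic in get_ner_fmeasure.
right_num, golden_num, predict_num = 6, 10, 8
precision = right_num / predict_num                        # 0.75
recall = right_num / golden_num                            # 0.6
f_measure = 2 * precision * recall / (precision + recall)  # ~0.667
print(precision, recall, f_measure)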
Example 13
    def __init__(self, data):
        super(SeqModel, self).__init__()
        self.use_crf = data.use_crf
        print("build network...")
        print("use_char: %s", data.use_char)
        if data.use_char:
            print("char feature extractor: %s ", data.char_feature_extractor)
        print("word feature extractor: %s", data.word_feature_extractor)
        print("use crf: %s", self.use_crf)

        self.gpu = data.HP_gpu
        self.average_batch = data.average_batch_loss
        ## add two more labels for the lower-layer LSTM; the CRF uses the original label size
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.word_hidden = WordSequence(data)
        if self.use_crf:
            self.crf = CRF(label_size, self.gpu)
Example 14
    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                try:  # Will fail with python3
                    label_string = content_list[idx][0][idy].encode(
                        'utf-8') + " "
                except:
                    label_string = content_list[idx][0][idy] + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))
Example 15
def fmeasure_from_file(golden_file, predict_file, label_type="BMES"):
    print(f"Get f measure from file: {gold_file} {predict_file}")
    print("Label format: %s",label_type)
    golden_sent,golden_labels = readSentence(golden_file)
    predict_sent,predict_labels = readSentence(predict_file)
    P,R,F = get_ner_fmeasure(golden_labels, predict_labels, label_type)
    print("P:%sm R:%s, F:%s"%(P,R,F))
Example 16
 def build_pretrain_emb(self):
     if self.word_emb_dir:
         print("Load pretrained word embedding, norm: %s, dir: %s" %
               (self.norm_word_emb, self.word_emb_dir))
         self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
             self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
             self.norm_word_emb)
     if self.char_emb_dir:
         print("Load pretrained char embedding, norm: %s, dir: %s" %
               (self.norm_char_emb, self.char_emb_dir))
         self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
             self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
             self.norm_char_emb)
     for idx in range(self.feature_num):
         if self.feature_emb_dirs[idx]:
             print(
                 "Load pretrained feature %s embedding, norm: %s, dir: %s"
                 % (self.feature_name[idx], self.norm_feature_embs[idx],
                    self.feature_emb_dirs[idx]))
             self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                 idx] = build_pretrain_embedding(
                     self.feature_emb_dirs[idx],
                     self.feature_alphabets[idx],
                     self.feature_emb_dims[idx],
                     self.norm_feature_embs[idx])
Example 17
def config_file_to_dict(input_file):
    config = {}
    fins = open(input_file, 'r').readlines()
    for line in fins:
        if len(line) > 0 and line[0] == "#":
            continue
        if "=" in line:
            pair = line.strip().split('#', 1)[0].split('=', 1)
            item = pair[0]
            if item == "feature":
                if item not in config:
                    feat_dict = {}
                    config[item] = feat_dict
                feat_dict = config[item]
                new_pair = pair[-1].split()
                feat_name = new_pair[0]
                one_dict = {}
                one_dict["emb_dir"] = None
                one_dict["emb_size"] = 10
                one_dict["emb_norm"] = False
                if len(new_pair) > 1:
                    for idx in range(1, len(new_pair)):
                        conf_pair = new_pair[idx].split('=')
                        if conf_pair[0] == "emb_dir":
                            one_dict["emb_dir"] = conf_pair[-1]
                        elif conf_pair[0] == "emb_size":
                            one_dict["emb_size"] = int(conf_pair[-1])
                        elif conf_pair[0] == "emb_norm":
                            one_dict["emb_norm"] = str2bool(conf_pair[-1])
                feat_dict[feat_name] = one_dict
                # print "feat",feat_dict
            else:
                if item in config:
                    print(
                        "Warning: duplicated config item found: %s, updated." %
                        (pair[0]))
                config[item] = pair[-1]
    return config
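For illustration, a few hypothetical config lines and the dict that config_file_to_dict builds from them (feature lines are grouped under config["feature"]; every other item is stored as a raw string):

# Hypothetical config lines, written to a temp file and parsed with the function above.
import tempfile

sample = ("feature=[POS] emb_size=20 emb_norm=False\n"
          "feature=[Cap] emb_size=10\n"
          "learning_rate=0.015\n")
with tempfile.NamedTemporaryFile('w', suffix='.config', delete=False) as tmp:
    tmp.write(sample)
print(config_file_to_dict(tmp.name))
# {'feature': {'[POS]': {'emb_dir': None, 'emb_size': 20, 'emb_norm': False},
#              '[Cap]': {'emb_dir': None, 'emb_size': 10, 'emb_norm': False}},
#  'learning_rate': '0.015'}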
Example 18
 def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim,
              hidden_dim, dropout, gpu):
     super(CharCNN, self).__init__()
     print("build char sequence feature extractor: CNN ...")
     self.gpu = gpu
     self.hidden_dim = hidden_dim
     self.char_drop = nn.Dropout(dropout)
     self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
     if pretrain_char_embedding is not None:
         self.char_embeddings.weight.data.copy_(
             torch.from_numpy(pretrain_char_embedding))
     else:
         self.char_embeddings.weight.data.copy_(
             torch.from_numpy(
                 self.random_embedding(alphabet_size, embedding_dim)))
     self.char_cnn = nn.Conv1d(embedding_dim,
                               self.hidden_dim,
                               kernel_size=3,
                               padding=1)
     if self.gpu:
         self.char_drop = self.char_drop.cuda()
         self.char_embeddings = self.char_embeddings.cuda()
         self.char_cnn = self.char_cnn.cuda()
Example 19
def build_pretrain_embedding(embedding_path,
                             word_alphabet,
                             embedd_dim=100,
                             norm=True):
    embedd_dict = dict()
    if embedding_path != None:
        embedd_dict, embedd_dim = load_pretrain_emb(embedding_path)
    alphabet_size = word_alphabet.size()
    scale = np.sqrt(3.0 / embedd_dim)
    pretrain_emb = np.empty([word_alphabet.size(), embedd_dim])
    perfect_match = 0
    case_match = 0
    not_match = 0
    for word, index in word_alphabet.iteritems():
        if word in embedd_dict:
            if norm:
                pretrain_emb[index, :] = norm2one(embedd_dict[word])
            else:
                pretrain_emb[index, :] = embedd_dict[word]
            perfect_match += 1
        elif word.lower() in embedd_dict:
            if norm:
                pretrain_emb[index, :] = norm2one(embedd_dict[word.lower()])
            else:
                pretrain_emb[index, :] = embedd_dict[word.lower()]
            case_match += 1
        else:
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedd_dim])
            not_match += 1
    pretrained_size = len(embedd_dict)
    print(
        "Embedding:\n     pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s"
        % (pretrained_size, perfect_match, case_match, not_match,
           (not_match + 0.) / alphabet_size))
    return pretrain_emb, embedd_dim
Example 20
 def initial_feature_alphabets(self):
     items = open(self.train_dir, 'r').readline().strip('\n').split()
     total_column = len(items)
     if total_column > 2:
         for idx in range(1, total_column - 1):
             feature_prefix = items[idx].split(']', 1)[0] + "]"
             self.feature_alphabets.append(Alphabet(feature_prefix))
             self.feature_name.append(feature_prefix)
             print("Find feature: %s", feature_prefix)
     self.feature_num = len(self.feature_alphabets)
     self.pretrain_feature_embeddings = [None] * self.feature_num
     self.feature_emb_dims = [20] * self.feature_num
     self.feature_emb_dirs = [None] * self.feature_num
     self.norm_feature_embs = [False] * self.feature_num
     self.feature_alphabet_sizes = [0] * self.feature_num
     if self.feat_config:
         for idx in range(self.feature_num):
             if self.feature_name[idx] in self.feat_config:
                 self.feature_emb_dims[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_size']
                 self.feature_emb_dirs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_dir']
                 self.norm_feature_embs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_norm']
Example 21
def choose_label(input_file, output_file):
    with open(input_file,'r') as in_file:
        fins = in_file.readlines()
    with open(output_file,'w') as fout:
        for line in fins:
            if len(line) < 3:
                fout.write(line)
            else:
                pairs = line.strip('\n').split(' ')
                fout.write(pairs[0]+" "+ pairs[-1]+"\n")


if __name__ == '__main__':
    '''Convert NER tag schemes among IOB/BIO/BIOES.
        For example: if you want to convert the IOB tag scheme to BIO, then you run as following:
            python NERSchemeConverter.py IOB2BIO input_iob_file output_bio_file
        Input data format is the standard CoNLL 2003 data format.
    '''
    if sys.argv[1].upper() == "IOB2BIO":
        IOB2BIO(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "BIO2BIOES":
        BIO2BIOES(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "BIOES2BIO":
        BIOES2BIO(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "IOB2BIOES":
        IOB2BIO(sys.argv[2],"temp")
        BIO2BIOES("temp",sys.argv[3])
    else:
        print("Argument error: sys.argv[1] should belongs to \"IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES\"")
Example 22
def load_pretrain_emb(embedding_path):
    embedd_dim = -1
    embedd_dict = dict()
    with open(embedding_path, 'r') as file:
        for line in file:
            # skip lines that start with a space
            if line.startswith(' '):
                continue
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens)), f"Inconsistent embedding dimension at line: {line}"
            embedd = np.empty([1, embedd_dim])
            embedd[:] = tokens[1:]
            if sys.version_info[0] < 3:
                first_col = tokens[0].decode('utf-8')
            else:
                first_col = tokens[0]
            embedd_dict[first_col] = embedd
    return embedd_dict, embedd_dim
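A small hypothetical usage example for the loader above (whitespace-separated, one token per line, 3-dimensional vectors):

# Hypothetical usage of load_pretrain_emb with a tiny 3-dimensional embedding file.
with open("sample.vec", "w") as f:
    f.write("the 0.418 0.250 -0.412\n")
    f.write("cat 0.013 0.237 -0.169\n")
embedd_dict, embedd_dim = load_pretrain_emb("sample.vec")
print(embedd_dim)            # 3
print(embedd_dict["cat"])    # [[ 0.013  0.237 -0.169]]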


if __name__ == '__main__':
    a = np.arange(9.0)
    print(a)
    print(norm2one(a))
Example 23
def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(0)
    best_dev = -10
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            # .item() extracts the scalar loss (0-dim tensor) on PyTorch >= 0.4
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(
                    "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                    % (end, temp_cost, sample_loss, right_token, whole_token,
                       (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
              (end, temp_cost, sample_loss, right_token, whole_token,
               (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir + '.' + str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc))
        gc.collect()
Example 24
def lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr / (1 + decay_rate * epoch)
    print(f" Learning rate is setted as: {lr}")
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer
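For example, with the hypothetical values init_lr=0.015 and decay_rate=0.05, the inverse-time decay above gives:

# Hypothetical values illustrating lr = init_lr / (1 + decay_rate * epoch).
init_lr, decay_rate = 0.015, 0.05
for epoch in (0, 1, 10):
    print(epoch, init_lr / (1 + decay_rate * epoch))
# 0 -> 0.015, 1 -> ~0.01429, 10 -> 0.01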
Example 25
    parser.add_argument('--train', default="data/conll03/train.bmes")
    parser.add_argument('--dev', default="data/conll03/dev.bmes")
    parser.add_argument('--test', default="data/conll03/test.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()
    data = Data()

    data.train_dir = args.train
    data.dev_dir = args.dev
    data.test_dir = args.test
    data.model_dir = args.savemodel
    data.dset_dir = args.savedset
    print("aaa", data.dset_dir)
    status = args.status.lower()
    save_model_dir = args.savemodel
    data.HP_gpu = torch.cuda.is_available()
    print("Seed num:", seed_num)
    data.number_normalized = True
    data.word_emb_dir = "../data/glove.6B.100d.txt"

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.use_char = True
        data.HP_batch_size = 10
        data.HP_lr = 0.015
        data.char_seq_feature = "CNN"
        data.generate_instance('train')
Example 26
def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1):
    sent, golden_labels, predict_labels = readTwoLabelSentence(twolabel_file, pred_col)
    acc, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)
    print("P:%s, R:%s, F:%s" % (P, R, F))
Example 27
    def __init__(self, data):
        super(WordSequence, self).__init__()
        print("build word sequence feature extractor: %s..." %
              (data.word_feature_extractor))
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        # self.batch_size = data.HP_batch_size
        # self.hidden_dim = data.HP_hidden_dim
        self.droplstm = nn.Dropout(data.HP_dropout)
        self.bilstm_flag = data.HP_bilstm
        self.lstm_layer = data.HP_lstm_layer
        self.wordrep = WordRep(data)
        self.input_size = data.word_emb_dim
        if self.use_char:
            self.input_size += data.HP_char_hidden_dim
            if data.char_feature_extractor == "ALL":
                self.input_size += data.HP_char_hidden_dim
        for idx in range(data.feature_num):
            self.input_size += data.feature_emb_dims[idx]
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        if self.bilstm_flag:
            lstm_hidden = data.HP_hidden_dim // 2
        else:
            lstm_hidden = data.HP_hidden_dim

        self.word_feature_extractor = data.word_feature_extractor
        if self.word_feature_extractor == "GRU":
            self.lstm = nn.GRU(self.input_size,
                               lstm_hidden,
                               num_layers=self.lstm_layer,
                               batch_first=True,
                               bidirectional=self.bilstm_flag)
        elif self.word_feature_extractor == "LSTM":
            self.lstm = nn.LSTM(self.input_size,
                                lstm_hidden,
                                num_layers=self.lstm_layer,
                                batch_first=True,
                                bidirectional=self.bilstm_flag)
        elif self.word_feature_extractor == "CNN":
            # cnn_hidden = data.HP_hidden_dim
            self.word2cnn = nn.Linear(self.input_size, data.HP_hidden_dim)
            self.cnn_layer = data.HP_cnn_layer
            print("CNN layer: %s", self.cnn_layer)
            self.cnn_list = nn.ModuleList()
            self.cnn_drop_list = nn.ModuleList()
            self.cnn_batchnorm_list = nn.ModuleList()
            kernel = 3
            pad_size = (kernel - 1) // 2  # Conv1d padding must be an integer
            for idx in range(self.cnn_layer):
                self.cnn_list.append(
                    nn.Conv1d(data.HP_hidden_dim,
                              data.HP_hidden_dim,
                              kernel_size=kernel,
                              padding=pad_size))
                self.cnn_drop_list.append(nn.Dropout(data.HP_dropout))
                self.cnn_batchnorm_list.append(
                    nn.BatchNorm1d(data.HP_hidden_dim))
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(data.HP_hidden_dim,
                                    data.label_alphabet_size)

        if self.gpu:
            self.droplstm = self.droplstm.cuda()
            self.hidden2tag = self.hidden2tag.cuda()
            if self.word_feature_extractor == "CNN":
                self.word2cnn = self.word2cnn.cuda()
                for idx in range(self.cnn_layer):
                    self.cnn_list[idx] = self.cnn_list[idx].cuda()
                    self.cnn_drop_list[idx] = self.cnn_drop_list[idx].cuda()
                    self.cnn_batchnorm_list[idx] = self.cnn_batchnorm_list[
                        idx].cuda()
            else:
                self.lstm = self.lstm.cuda()
Example 28
    return pred_results, pred_scores


if __name__ == '__main__':
    from utils.logging import logger

    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--config', help='Configuration File')

    args = parser.parse_args()
    data = Data()
    data.read_config(args.config)
    # save the config in the model dir
    expr_dir = os.path.dirname(data.model_dir)
    print(f"Experiment inside {expr_dir}")

    if not os.path.exists(expr_dir):
        os.makedirs(expr_dir)

    logger.addHandler(logging.FileHandler(os.path.join(expr_dir, 'log'), 'w'))

    print("Copying your config in the experiment dir")
    shutil.copy(args.config, expr_dir)
    status = data.status.lower()
    data.HP_gpu = torch.cuda.is_available()
    print("Seed num: %s", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
Example 29
    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Dset   file directory: %s" % (self.dset_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char extractor: %s" %
                  (self.char_feature_extractor))
            print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        print(" " + "++" * 20)
        print(" Training:")
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()
Example 30
    def __init__(self, data):
        super(WordRep, self).__init__()
        print("build word representation...")
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_feature_extractor == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               data.pretrain_char_embedding,
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              data.pretrain_char_embedding,
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
                self.char_feature_extra = CharBiLSTM(
                    data.char_alphabet.size(), data.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    data.HP_dropout, self.gpu)
            else:
                print(
                    "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL)."
                )
                exit(0)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        self.feature_num = data.feature_num
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            self.feature_embeddings.append(
                nn.Embedding(data.feature_alphabets[idx].size(),
                             self.feature_embedding_dims[idx]))
        for idx in range(self.feature_num):
            if data.pretrain_feature_embeddings[idx] is not None:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(data.pretrain_feature_embeddings[idx]))
            else:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))

        if self.gpu:
            self.drop = self.drop.cuda()
            self.word_embedding = self.word_embedding.cuda()
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda()