Example #1
    def __init__(self): 
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_word_emb = False
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        self.word_dict = Word_Trie()
        self.word_alphabet = Alphabet('word')

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.char_emb_dim = 50
        self.word_emb_dim = 50
        self.pretrain_char_embedding = None
        self.pretrain_word_embedding = None
        self.label_size = 0
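
Every example on this page builds Alphabet objects for words, characters, gazetteer entries, or labels. As a reading aid, here is a minimal sketch of the string-to-id mapping these snippets appear to assume (modeled on the NCRF++-style Alphabet; the attribute and method names below are assumptions, and some examples use variants whose constructors take unkflag/padflag instead of label):

class Alphabet:
    """Minimal sketch of a growable string <-> id mapping (assumed API)."""
    def __init__(self, name, label=False, keep_growing=True):
        self.name = name
        self.label = label            # label alphabets usually skip the UNK entry
        self.keep_growing = keep_growing
        self.UNKNOWN = '</unk>'
        self.instance2index = {}
        self.instances = []
        if not self.label:
            self.add(self.UNKNOWN)

    def add(self, instance):
        if instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = len(self.instances)  # ids start at 1; 0 is reserved for padding

    def get_index(self, instance):
        if instance in self.instance2index:
            return self.instance2index[instance]
        if self.keep_growing:
            self.add(instance)
            return self.instance2index[instance]
        return self.instance2index.get(self.UNKNOWN, 0)

    def get_instance(self, index):
        return self.instances[index - 1]  # inverse of get_index for ids >= 1

    def size(self):
        return len(self.instances) + 1  # +1 for the reserved padding id

    def close(self):
        self.keep_growing = False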
Example #2
 def __init__(self):
     self.max_sentence_length = 200
     self.number_normalized = True
     self.norm_char_emb = True
     self.norm_gaz_emb = True
     self.dataset_name = 'msra'
     self.tagscheme = "NoSeg"
     self.char_alphabet = Alphabet('character')
     self.label_alphabet = Alphabet('label', unkflag=False)
     self.gaz_lower = False
     self.gaz = Gazetteer(self.gaz_lower)
     self.gaz_alphabet = Alphabet('gaz')
     self.train_ids = []
     self.dev_ids = []
     self.test_ids = []
     self.train_texts = []
     self.dev_texts = []
     self.test_texts = []
     self.char_emb_dim = 100
     self.gaz_emb_dim = 100
     self.pretrain_char_embedding = None
     self.pretrain_gaz_embedding = None
     self.dev_cut_num = 0
     self.train_cut_num = 0
     self.test_cut_num = 0
     self.cut_num = 0
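
Examples #2 and #3 (and several later ones) attach a Gazetteer, and Example #1 a Word_Trie; lattice-LSTM-style taggers use these to enumerate all lexicon words starting at each character position. A hedged sketch of that trie-based matching, with illustrative names (TrieNode, SimpleGazetteer, enumerate_matches) rather than the projects' actual API:

class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_word = False

class SimpleGazetteer:
    def __init__(self, lower=False):
        self.lower = lower
        self.root = TrieNode()

    def insert(self, chars):
        # Add one lexicon word, given as a list of characters.
        node = self.root
        for ch in chars:
            key = ch.lower() if self.lower else ch
            node = node.children.setdefault(key, TrieNode())
        node.is_word = True

    def enumerate_matches(self, chars):
        # Return every lexicon word that starts at position 0 of `chars`.
        node, matched, prefix = self.root, [], []
        for ch in chars:
            key = ch.lower() if self.lower else ch
            if key not in node.children:
                break
            node = node.children[key]
            prefix.append(ch)
            if node.is_word:
                matched.append(''.join(prefix))
        return matched

gaz = SimpleGazetteer()
for w in ['南京', '南京市', '市长']:
    gaz.insert(list(w))
print(gaz.enumerate_matches(list('南京市长江大桥')))  # ['南京', '南京市']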
Example #3
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_gaz_emb = False
        self.use_single = True
        self.word_alphabet = Alphabet('word')
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower, self.use_single)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.train_golds = []
        self.dev_golds = []
        self.test_golds = []
        self.raw_golds = []

        self.word_emb_dim = 50
        self.gaz_emb_dim = 100
        self.gaz_dropout = 0.3
        self.pretrain_word_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_hidden_dim = 100
        self.HP_dropout = 0.3
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

        self.gpu = False
        self.enty_dropout = 0.3
        # self.cls_mode = 'sigmoid'  # or softmax
        self.cls_mode = 'softmax'
Example #4
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        # self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.gaz_count = {}
        self.gaz_split = {}
        self.biword_count = {}

        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.HP_use_count = False

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.train_split_index = []
        self.dev_split_index = []

        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        # self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        # self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        # self.char_alphabet_size = 0
        self.label_alphabet_size = 0
Example #5
 def __init__(self):
     self.relational_alphabet = Alphabet("Relation",
                                         unkflag=False,
                                         padflag=False)
     self.train_data = None
     self.valid_data = None
     self.test_data = None
Example #6
 def initial_feature_alphabets(self):
     with open(self.train_dir, 'r') as rf:
         items = rf.readline().strip('\n').split()
     print(items)
     total_column = len(items)
     if total_column > 2:
         for idx in range(1, total_column - 1):
             feature_prefix = items[idx].split(']', 1)[0] + "]"
             print("feature_prefix:{}".format(feature_prefix))
             self.feature_alphabets.append(Alphabet(feature_prefix))
             self.feature_name.append(feature_prefix)
             print("Find feature: ", feature_prefix)
     self.feature_num = len(self.feature_alphabets)
     self.pretrain_feature_embeddings = [None] * self.feature_num
     self.feature_emb_dims = [20] * self.feature_num
     self.feature_emb_dirs = [None] * self.feature_num
     self.norm_feature_embs = [False] * self.feature_num
     self.feature_alphabet_sizes = [0] * self.feature_num
     if self.feat_config:
         for idx in range(self.feature_num):
             if self.feature_name[idx] in self.feat_config:
                 self.feature_emb_dims[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_size']
                 self.feature_emb_dirs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_dir']
                 self.norm_feature_embs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_norm']
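
initial_feature_alphabets infers one alphabet per bracketed feature column from the first line of the training file. A small illustration of the NCRF++-style column format it expects (the sample line is invented):

# First token is the word, last is the label, and each middle column
# is "[FeatureName]value"; the prefix up to ']' names the feature.
line = "China [POS]NNP [Cap]1 B-LOC"
items = line.strip('\n').split()
feature_prefixes = [tok.split(']', 1)[0] + ']' for tok in items[1:-1]]
print(feature_prefixes)  # ['[POS]', '[Cap]']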
Example #7
 def __init__(self):
     self.relational_alphabet = Alphabet("Relation",
                                         unkflag=False,
                                         padflag=False)
     self.train_loader = []
     self.valid_loader = []
     self.test_loader = []
     self.weight = {}
Example #8
 def __init__(self, data_config_file, alphabet_path, if_train=True):
     if if_train:
         with open(data_config_file, 'r') as rf:
             self.data_config = yaml.load(rf, Loader=yaml.FullLoader)
         # init data file
         mode = self.data_config['mode']
         self.data_file = os.path.join(ROOT_PATH,
                                       self.data_config['data'][mode])
         # init ac tree
         specific_words_file = os.path.join(
             ROOT_PATH, self.data_config['specific_words_file'])
         self.trees = Trees.build_trees(specific_words_file)
         # init alphabet
         self.char_alphabet = Alphabet('char')
         self.intent_alphabet = Alphabet('intent')
         self.label_alphabet = Alphabet('label', label=True)
         self.char_alphabet_size, self.intent_alphabet_size, self.label_alphabet_size = -1, -1, -1
         # pad length
         self.char_max_length = self.data_config['char_max_length']
         # read data file
         with open(self.data_file, 'r') as rf:
             self.corpus = rf.readlines()
         self.build_alphabet(alphabet_path)
         self.texts, self.ids = self.read_instance()
         self.train_texts, self.train_ids, self.dev_texts, self.dev_ids, self.test_texts, self.test_ids = self.sample_split()
     else:  # inference use
         self.char_alphabet = Alphabet('char', keep_growing=False)
         self.intent_alphabet = Alphabet('intent', keep_growing=False)
         self.label_alphabet = Alphabet('label',
                                        label=True,
                                        keep_growing=False)
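
In the inference branch the alphabets are rebuilt with keep_growing=False, which implies their contents are restored from alphabet_path (presumably inside build_alphabet). A sketch of one way such persistence could work; the JSON layout and helper names are hypothetical, and the instances/add/close attributes follow the minimal Alphabet sketch near the top of this page:

import json
import os

def save_alphabet(alphabet, path):
    # Hypothetical: dump the ordered instance list to <path>/<name>.json.
    with open(os.path.join(path, alphabet.name + '.json'), 'w') as wf:
        json.dump(alphabet.instances, wf, ensure_ascii=False)

def load_alphabet(alphabet, path):
    # Hypothetical: re-add instances in saved order, then freeze the alphabet.
    with open(os.path.join(path, alphabet.name + '.json'), 'r') as rf:
        for instance in json.load(rf):
            alphabet.add(instance)
    alphabet.close()  # same effect as keep_growing=False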
Example #9
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250  # maximum sentence length
        self.number_normalized = True  # whether to normalize digits
        self.norm_word_emb = True  # whether to normalize word embeddings
        self.word_alphabet = Alphabet('word')  # word vocabulary and ids
        self.label_alphabet = Alphabet('label', True)  # label alphabet without a "</unk>" entry
        # tagging scheme
        self.tagScheme = "NoSeg"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.word_emb_dim = 50
        self.pretrain_word_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 200
        self.HP_batch_size = 32  # 1
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.3
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = True  # true
        self.HP_lr = 0.01
        self.HP_lr_decay = 0.05
        self.weight_decay = 0.00000005
        self.use_clip = False
        self.HP_clip = 5.0
        self.HP_momentum = 0  # a hyperparameter controlling the optimizer
        self.random_seed = 100
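
Several of these examples set number_normalized = True; in NCRF++-style preprocessing that conventionally means digits are mapped to '0' before alphabet lookup, so all numbers share one embedding. A sketch of the usual helper (assumed, not taken from this example):

def normalize_word(word):
    # Replace every digit with '0' so "2017" and "1999" map to the same token.
    return ''.join('0' if ch.isdigit() else ch for ch in word)

print(normalize_word('v2.5-2017'))  # 'v0.0-0000'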
Example #10
# @Last Modified time: 2017-07-15 17:13:30

import sys
import numpy as np
from utils.alphabet import Alphabet
from utils.data_processor import *
from model.Model import *
from utils.keras_utils import padding
from utils.metric import *
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# from keras.utils.vis_utils import plot_model

word_alphabet = Alphabet('word')
char_alphabet = Alphabet('char')

nb_epoch = 100
use_char = True
mask_zero = True
BILSTM = True
DropProb = 0.2
case_sense = True
batch_size = 128
grad_discent = "adam"
lstm_average = False
label_type = 'BMES'
char_emb_dims = 50
nb_filter = 100
filter_length = 3
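
The set_session import in this example is typically paired with a TF1-era session configured so TensorFlow allocates GPU memory on demand. A sketch of that common pattern (an assumption about the elided setup, not code recovered from the original script):

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory as needed instead of grabbing it all
set_session(tf.Session(config=config))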
Example #11
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None

        self.trans_dir = None

        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None

        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0

        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ###Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_trans = True
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_trans_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8
Example #12
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.pos_alphabet = Alphabet('pos')
        self.label_alphabet = Alphabet('label', True)

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50

        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.label_alphabet_size = 0
        #  hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 16
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.2
        self.HP_lstmdropout = 0
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

        #  attention
        self.tencent_word_embed_dim = 200
        self.pos_embed_dim = 200
        self.cross_domain = False
        self.cross_test = False
        self.use_san = False
        self.use_cnn = False
        self.use_attention = True
        self.pos_to_idx = {}
        self.external_pos = {}
        self.token_replace_prob = {}
        self.use_adam = False
        self.use_bert = False
        self.use_warmup_adam = False
        self.use_sgd = False
        self.use_adadelta = False
        self.use_window = True
        self.mode = 'train'
        self.use_tencent_dic = False

        # cross domain file
        self.computer_file = ""
        self.finance_file = ""
        self.medicine_file = ""
        self.literature_file = ""
Example #13
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.word_alphabet = Alphabet('word')
        self.label = [
            "O", "B-A", "I-A", "B-O", "I-O", "B-E", "I-E", "B-T", "I-T", "B-C",
            "I-C"
        ]
        self.label_alphabet = Alphabet('label', True)
        self.sentence_type_alphabet = Alphabet('sentence', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None

        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.word_emb_file = None

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.use_pre_trained_model = None

        self.word_alphabet_size = 0
        self.opinion_label_alphabet_size = 0
        self.evidence_label_alphabet_size = 0
        self.sentence_alphabet_size = 0
        self.word_emb_dim = 50
        self.lstm_input_size = 50

        ###Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_attention_query_input_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8
Example #14
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        # self.simi_alphabet = Alphabet('simi')  # info about words used for similarity computation
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.gaz_count = {}
        self.gaz_split = {}
        self.biword_count = {}

        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.HP_use_count = False

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.train_split_index = []
        self.dev_split_index = []

        self.use_bigram = True
        self.word_emb_dim = 200
        self.biword_emb_dim = 200
        self.char_emb_dim = 30
        self.gaz_emb_dim = 200
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # parameters for dictionary-based similarity
        self.simi_dic_emb = None  # similarity embedding values
        self.simi_dic_dim = 10  # dimension of the similarity vector
        self.use_dictionary = False  # whether a dictionary is currently used
        self.simi_list = []  # similarity value for each character
        # self.use_gazcount = 'True'

        ### hyperparameters
        self.HP_iteration = 60
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 128
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

        self.HP_num_layer = 4
Example #15
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
Example #16
    def __init__(self, args):
        super(Data, self).__init__()
        self.args = args
        self.data_dir = args.data_dir  # './data/gene_term_format_by_sentence.json'
        self.data_ratio = (0.9, 0.05, 0.05)  # total 2000
        self.model_save_dir = args.savemodel  # './saves/model/'
        self.output_dir = args.output  # './saves/output/'
        self.data_save_file = args.savedset  # './saves/data/dat.pkl'

        self.pos_as_feature = args.use_pos
        self.use_elmo = args.use_elmo
        self.elmodim = args.elmodim
        self.pos_emb_dim = args.posdim
        self.useSpanLen = args.use_len
        self.use_sentence_att = args.use_sent_att
        self.use_char = True
        self.ranking = 1

        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.ptag_alphabet = Alphabet('tag')
        self.label_alphabet = Alphabet('label', label=True)
        self.seqlabel_alphabet = Alphabet('span_label', label=True)

        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.ptag_alphabet_size = 0
        self.label_alphabet_size = 0
        self.seqlabel_alphabet_size = 0

        self.max_sentence_length = 500

        self.term_truples = []

        self.sent_texts = []
        self.chars = []
        self.lengths = []
        self.ptags = []
        self.seq_labels = []

        self.word_ids_sent = []
        self.char_id_sent = []
        self.tag_ids_sent = []
        self.label_ids_sent = []
        self.seq_labels_ids = []

        self.longSpan = True
        self.shortSpan = True
        self.termratio = args.term_ratio
        self.term_span = args.max_length

        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.char_feature_extractor = "CNN"  ## "LSTM"/"CNN"/"GRU"/None

        # training
        self.optimizer = 'Adam'  # "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.training = True
        self.average_batch_loss = True
        self.evaluate_every = args.evaluate_every  # 10 # evaluate every n batches
        self.print_every = args.print_every
        self.silence = True
        self.earlystop = args.early_stop

        # Embeddings
        self.word_emb_dir = args.wordemb  # './data/glove.6B.100d.txt' # None #'../data/glove.6b.100d.txt'
        self.char_emb_dir = args.charemb
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.spamEm_dim = 30
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None

        # HP
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 100
        self.HP_cnn_layer = 2
        self.HP_batch_size = 100
        self.HP_epoch = 100
        self.HP_lr = args.lr
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_l2 = 1e-8
        self.HP_dropout = args.dropout
        self.HP_lstm_layer = 2
        self.HP_bilstm = True
        self.HP_gpu = args.use_gpu  # False#True
        self.HP_term_span = 6
        self.HP_momentum = 0

        # init data
        self.build_vocabs()
        self.all_instances = self.load_data()
        self.load_pretrain_emb()
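
data_ratio = (0.9, 0.05, 0.05) divides the corpus into train/dev/test portions. A sketch of how such a ratio is commonly applied (split_by_ratio is illustrative, not this class's actual load_data logic):

def split_by_ratio(instances, ratio=(0.9, 0.05, 0.05)):
    # Deterministic contiguous split; shuffle beforehand if order matters.
    n_train = int(len(instances) * ratio[0])
    n_dev = int(len(instances) * ratio[1])
    return (instances[:n_train],
            instances[n_train:n_train + n_dev],
            instances[n_train + n_dev:])

train, dev, test = split_by_ratio(list(range(2000)))
print(len(train), len(dev), len(test))  # 1800 100 100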
Example #17
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_bichar_emb = True
        self.norm_gaz_emb = False
        self.use_single = False
        self.char_alphabet = Alphabet('char')
        self.bichar_alphabet = Alphabet('bichar')
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower, self.use_single)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bichar = False
        self.char_emb_dim = 50
        self.bichar_emb_dim = 50
        self.gaz_emb_dim = 50
        self.posi_emb_dim = 30
        self.gaz_dropout = 0.5
        self.pretrain_char_embedding = None
        self.pretrain_bichar_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.char_alphabet_size = 0
        self.bichar_alphabet_size = 0
        self.character_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyper parameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        # self.HP_char_hidden_dim = 50  # int. Character hidden vector dimension for the character sequence layer.
        self.HP_hidden_dim = 200  # int. Word hidden vector dimension for the word sequence layer.
        self.HP_dropout = 0.5  # float. Dropout probability.
        self.HP_lstm_layer = 1  # int. Number of LSTM layers for the word sequence layer.
        self.HP_bilstm = True  # boolean. Whether to use a bidirectional LSTM for the word sequence layer.
        self.HP_gpu = False
        # Word-level LSTM models (e.g. char LSTM + word LSTM + CRF) prefer an lr around 0.015.
        # Word-level CNN models (e.g. char LSTM + word CNN + CRF) prefer an lr around 0.005 and more iterations.
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05  # float. Learning rate decay rate; only used when optimizer=SGD.
        self.HP_clip = 1.0  # float. Clip gradients larger than this value.
        self.HP_momentum = 0  # float. Momentum.

        self.HP_use_posi = False
        self.HP_num_layer = 4
        self.HP_rethink_iter = 2
        self.model_name = 'CNN_model'
        self.posi_alphabet_size = 0