def __init__(self, config, alphabet: Alphabet, emb_dim, device):
    super(TextCNN, self).__init__()
    self.config = config
    self.embeddings = nn.Embedding(alphabet.size(), emb_dim)
    # self.embeddings.weight.requires_grad = False
    if config['train_mode'] == 'static':
        # from_pretrained freezes the embedding weights by default
        self.embeddings = nn.Embedding.from_pretrained(
            torch.from_numpy(alphabet.pretrained_emb))
    elif config['train_mode'] == 'fine-tuned':
        self.embeddings.weight.data.copy_(
            torch.from_numpy(alphabet.pretrained_emb))
    filters = config['filters']
    # Conv2d (not Conv1d) is required for the 2-D kernels (w, emb_dim) below:
    # each branch convolves over w-grams spanning the full embedding width,
    # then max-pools to one value per output channel.
    self.cnn = nn.ModuleList([
        nn.Sequential(
            nn.Conv2d(1, config['output_channels'], (w, emb_dim)),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d(1))
        for w in filters
    ])
    self.linear = nn.Linear(config['output_channels'] * len(filters), 2,
                            bias=True)
    self.dropout = nn.Dropout(config['dropout'])
    self.relu = nn.ReLU()
    self.scale = np.sqrt(3.0 / emb_dim)
    self.apply(self._init_esim_weights)
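# A minimal forward-pass sketch for the TextCNN above -- not part of the
# original file. It assumes `x` is a LongTensor of token ids with shape
# (batch, seq_len); each branch ends in AdaptiveMaxPool2d(1), so every branch
# yields (batch, output_channels, 1, 1) regardless of the filter width w.
def forward(self, x):
    emb = self.embeddings(x)              # (batch, seq_len, emb_dim)
    emb = self.dropout(emb).unsqueeze(1)  # (batch, 1, seq_len, emb_dim) for Conv2d
    pooled = [conv(emb).view(x.size(0), -1) for conv in self.cnn]
    features = torch.cat(pooled, dim=1)   # (batch, output_channels * len(filters))
    return self.linear(features)          # (batch, 2) logits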
def load_config_pos(config_path, char_embedd_dim):
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = \
        load_config(config_path)
    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')
    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')
    # random char embeddings, uniform in [-sqrt(3/d), sqrt(3/d)]
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [alphabet_char.size(), char_embedd_dim]).astype(theano.config.floatX)
    return (max_sent_length, max_char_length, num_labels, embedd_dim_concat,
            alphabet_char, alphabet_label, char_embedd_table)
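# Hypothetical usage of load_config_pos above; the path and the dimension are
# placeholders. Alphabet.load(dir, name) is assumed to restore an alphabet
# previously saved under that directory, as in the function body.
(max_sent_length, max_char_length, num_labels, embedd_dim_concat,
 alphabet_char, alphabet_label, char_embedd_table) = load_config_pos(
    'models/pos_config', char_embedd_dim=30)
print("chars: %d, labels: %d" % (alphabet_char.size(), alphabet_label.size()))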
class Data:
    def __init__(self, args):
        # Alphabets
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        # data
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.input_size = 0
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyperparameters
        self.HP_word_emb_dim = args.embedding_size
        self.HP_char_emb_dim = args.char_embedding_size
        self.HP_iteration = args.max_epoch
        self.HP_batch_size = args.batch_size
        self.HP_char_hidden_dim = args.char_hidden_dim
        self.HP_hidden_dim = args.hidden_size
        self.HP_dropout = args.dropout
        self.HP_char_dropout = args.char_dropout
        self.HP_use_char = bool(args.char_encoder)
        self.HP_char_features = args.char_encoder
        self.HP_gpu = torch.cuda.is_available() and args.gpu
        self.HP_lr = args.lr
        self.HP_model_name = args.model_name
        self.HP_encoder_type = args.encoder
        self.HP_optim = args.optim
        self.HP_number_normalized = args.number_normalized
        self.HP_seed = args.seed
        self.HP_l2 = args.l2
        self.HP_kernel_size = args.kernel_size
        self.HP_kernel_num = args.kernel_num
        # self.HP_lr_decay = 0.05
        # self.HP_clip = None
        # self.HP_momentum = 0
        # self.HP_lstm_layer = 1
        # self.HP_bilstm = True

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Word  alphabet size: %s" % self.word_alphabet_size)
        print("     Char  alphabet size: %s" % self.char_alphabet_size)
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.HP_word_emb_dim)
        print("     Char embedding size: %s" % self.HP_char_emb_dim)
        print("     Train instance number: %s" % len(self.train_texts))
        print("     Dev   instance number: %s" % len(self.dev_texts))
        print("     Test  instance number: %s" % len(self.test_texts))
        print("     Hyper      iteration: %s" % self.HP_iteration)
        print("     Hyper     batch size: %s" % self.HP_batch_size)
        print("     Hyper             lr: %s" % self.HP_lr)
        print("     Hyper     hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyper        dropout: %s" % self.HP_dropout)
        print("     Hyper            GPU: %s" % self.HP_gpu)
        print("     Hyper       use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("             Char_features: %s" % self.HP_char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            line = line.strip()
            if line:
                pairs = line.strip().split()
                label = pairs[0].strip()
                self.label_alphabet.add(label)
                for word in pairs[2:]:
                    if self.HP_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()

    def extend_word_char_alphabet(self, input_file_list):
        """Extend the word/char alphabets with words from extra files."""
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                line = line.strip()
                if line:
                    pairs = line.strip().split()
                    for word in pairs[2:]:
                        if self.HP_number_normalized:
                            # map digits inside the word to 0
                            word = normalize_word(word)
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" % (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" % (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % input_file)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def build_word_pretrain_emb(self, emb_path):
        """Load pretrained word embeddings."""
        self.pretrain_word_embedding, self.HP_word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.HP_word_emb_dim)

    def build_char_pretrain_emb(self, emb_path):
        """Load pretrained char embeddings."""
        self.pretrain_char_embedding, self.HP_char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.HP_char_emb_dim)
class BinarySource(Source):
    """
    Source for binary classification data in the following format: one
    example per line; feature-value pairs are whitespace-separated, and each
    feature is separated from its value by `sep` (':' by default). E.g.:

    1 f1:1.0 f2:1.0 f3:1.0
    -1 f2:1.0 f3:1.0 f8:1.0
    -1 f1:1.0 f2:1.0
    1 f8:1.0 f9:1.0 f10:1.0
    """

    def __init__(self, data, encoding="utf-8", feature_alphabet=None,
                 alphabet_pop=True, alphabet_lock=True, sep=":", bias=False,
                 bias_prefix="@@BIAS@@"):
        Source.__init__(self, data, encoding=encoding)
        self._Instance = BinaryClassificationInstance
        if feature_alphabet is not None:
            self._feature_alphabet = feature_alphabet
        else:
            self._feature_alphabet = Alphabet(locked=False)
        self._sep = sep
        self._bias = bias
        self._bias_prefix = bias_prefix
        if alphabet_pop:
            self._populate_alphabet()
        if alphabet_lock:
            self.lock_alphabet()
        else:
            self.unlock_alphabet()
        return

    def _parse(self):
        """ yield (label, feats) tuples parsed from the stream """
        sep = self._sep
        for line in self._stream:
            line = line.rstrip()
            items = line.split()
            cl = items[0]
            assert cl in [POS_LAB, NEG_LAB]
            feats = []
            if self._bias:
                feats.append((self._bias_prefix, 1.0))  # implicit bias
            for s in items[1:]:
                try:
                    f, v = s.rsplit(sep, 1)
                    v = float(v)
                    feats.append((f, v))
                except ValueError:
                    sys.exit("Datasource error: make sure you use the right datasource format.")
            yield (cl, feats)

    def _populate_alphabet(self):
        print >> sys.stderr, "Populating feature alphabet... ",
        self.unlock_alphabet()
        if self._stream_type == "generator":
            for i, gen_inst in enumerate(self._stream):  # read stream directly
                sys.stderr.write("%s" % "\b" * len(str(i)) + str(i))
                featvals = gen_inst.get_featvals()
                for (f, _) in featvals:
                    self._feature_alphabet.add(f)
        else:
            try:
                for tag, feats in self._parse():
                    for f, _ in feats:
                        self._feature_alphabet.add(f)
            except ValueError:
                sys.exit("Datasource error: make sure you use the right data format.")
            # rewind stream (only meaningful for files)
            try:
                self.rewind()
            except TypeError:
                sys.exit("TypeError: make sure rewind() is used only on files.")
        print >> sys.stderr, " done."
        print >> sys.stderr, "Number of features: %s" % self._feature_alphabet.size()
        return

    def unlock_alphabet(self):
        self._feature_alphabet.unlock()
        return

    def lock_alphabet(self):
        self._feature_alphabet.lock()
        return

    def set_alphabet(self, feature_alphabet):
        self._feature_alphabet = feature_alphabet
        return

    def get_alphabet(self):
        return self._feature_alphabet

    def get_input(self):
        for label, feats in self._parse():
            yield label, feats

    def __iter__(self):
        """ instance generator """
        feature_alphabet = self._feature_alphabet
        assert not (feature_alphabet.empty() and feature_alphabet.locked()), \
            "Feature alphabet is empty!"
        if self._stream_type in ["file", "list"]:
            for idx, (label, feats) in enumerate(self._parse()):
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in feats:
                        feature_alphabet.add(f)
                instance = self._Instance(idx, label, feats, feature_alphabet)
                yield instance
        elif self._stream_type == "generator":
            for idx, gen_inst in enumerate(self._stream):  # read stream directly
                featvals = gen_inst.get_featvals()
                label = gen_inst.get_label()
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in featvals:
                        feature_alphabet.add(f)
                # the original passed an undefined `label_alphabet` here,
                # which raised a NameError; dropped to match the file branch
                instance = self._Instance(idx, label, featvals, feature_alphabet)
                yield instance

    def size(self):
        s = len(list(self._stream))
        self.rewind()
        return s
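# A small usage sketch for BinarySource (Python 2, like the class above).
# 'train.bin' is a placeholder file in the "label feat:val ..." format from
# the docstring, and Source is assumed to accept a file path as `data`.
# The alphabet is populated during construction and then locked.
source = BinarySource("train.bin", bias=True)
for instance in source:
    pass  # each yield is a BinaryClassificationInstance
print >> sys.stderr, "features: %s" % source.get_alphabet().size()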
datapoints_test = load_dataponts(opt.test_file)
# datapoints_train = load_dataponts('training_instances_debug.txt')
# datapoints_test = load_dataponts('test_instances_debug.txt')
word_alphabet = Alphabet('word')
build_alphabet(word_alphabet, datapoints_train)
build_alphabet(word_alphabet, datapoints_test)
word_alphabet.close()

if d.config.get('norm_emb') is not None:
    logging.info("load pretrained word embedding ...")
    pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
        d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
    word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim,
                                  padding_idx=0)
    word_embedding.weight.data.copy_(
        torch.from_numpy(pretrain_word_embedding))
    embedding_dim = word_emb_dim
else:
    logging.info("randomly initialize word embedding ...")
    word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim,
                                  padding_idx=0)
    word_embedding.weight.data.copy_(
        torch.from_numpy(
            random_embedding(word_alphabet.size(), d.word_emb_dim)))
    embedding_dim = d.word_emb_dim
if oov == "random" and not fine_tune: logger.info("Close word alphabet.") word_alphabet.close() # read dev data logger.info("Reading data from dev set...") word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = dp.read_conll_sequence_labeling( dev_path, word_alphabet, label_alphabet, word_column, label_column) # close alphabets : by close we mean we cannot add any more words to the word vocabulary. #To DO :change to close this after train set alone word_alphabet.close() label_alphabet.close() # we are doing a -1 because we did not use the zer index. I believe this is to account for unknown word logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) # get maximum length : this is mainly for padding. max_length_train = utils.get_max_length(word_sentences_train) max_length_dev = utils.get_max_length(word_sentences_dev) #max_length_test = utils.get_max_length(word_sentences_test) max_length = min(dp.MAX_LENGTH, max(max_length_train, max_length_dev)) logger.info("Maximum length (i.e max words ) of training set is %d" % max_length_train) logger.info("Maximum length (i.e max words ) of dev set is %d" % max_length_dev) #logger.info("Maximum length (i.e max words ) of test set is %d" % max_length_test) logger.info("Maximum length (i.e max words ) used for training is %d" % max_length) logger.info("Padding training text and lables ...")
class Data:
    def __init__(self):
        self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos']
        self.substring_maxlen = 10
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}
        self.feature_names = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True
        ###
        self.task_name = None
        ### I/O
        self.data_bin_dir = None
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.middle_dir = None
        self.viterbi_inputs_model_name = None
        self.trans_dir = None
        self.decode_dir = None
        self.model_dir = None        ## model save file
        self.load_model_dir = None   ## model load file
        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None
        self.feature_emb_dirs = []
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100
        ### Classification
        ## Dataset Plus
        self.substring_dir = None
        self.bpe_emb_dir = None
        self.pos_emb_dir = None
        self.pretrain_bpe_embedding = None
        self.pretrain_pos_embedding = None
        self.bpe_emb_dim = 30
        self.pos_emb_dim = 30
        self.bpe_alphabet_size = 0
        self.pos_alphabet_size = 0
        self.norm_bpe_emb = False
        self.norm_pos_emb = False
        self.bpe_texts = []
        self.bpe_Ids = []
        self.pos_texts = []
        self.pos_Ids = []
        self.substring_train_texts = None
        self.substring_train_Ids = None
        self.substring_dev_texts = None
        self.substring_dev_Ids = None
        self.substring_test_texts = None
        self.substring_test_Ids = None
        self.substring_label_alphabet = Alphabet('substring_label', True)
        ### Networks
        self.word_feature_extractor = "LSTM"  # "LSTM"/"CNN"/"GRU"
        self.use_char = True
        self.char_seq_feature = "CNN"  # "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid
        # Training
        self.save_model = True
        self.state_training_name = 'default'
        self.average_batch_loss = False
        self.optimizer = "SGD"  # "SGD"/"Adam"
        self.status = "train"
        self.show_loss_per_batch = 100
        # Hyperparameters
        self.seed_num = None
        self.cnn_layer = 4
        self.iteration = 100
        self.batch_size = 10
        self.char_hidden_dim = 50
        self.trans_hidden_dim = 50
        self.hidden_dim = 200
        self.dropout = 0.5
        self.lstm_layer = 1
        self.bilstm = True
        self.gpu = False
        self.lr = 0.015
        self.lr_decay = 0.05
        self.clip = None
        self.momentum = 0
        self.l2 = 1e-8
        # circul
        self.circul_time = 4
        self.circul_deepth = 2
        self.circul_gather_output_mode = "concat"
        # decode prepare
        self.decode_prepare_mode = 'example'

    def init_substring_instance(self):
        len_names = len(self.substring_names)
        self.substring_train_texts = [[[] for _ in range(self.substring_maxlen)]
                                      for _ in range(len_names)]
        self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)]
                                  for _ in range(len_names)]
        self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)]
                                     for _ in range(len_names)]
        self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)]
                                   for _ in range(len_names)]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Trans alphabet size: %s" % (self.trans_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Tran embedding  dir: %s" % (self.trans_embed_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Tran embedding size: %s" % (self.trans_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Norm   tran     emb: %s" % (self.norm_trans_emb))
        print("++" * 50)
        print("     task           name: %s" % (self.task_name))
        print("++" * 50)
        print("     Data bin file directory: %s" % (self.data_bin_dir))
        print("     Train   file directory: %s" % (self.train_dir))
        print("     Dev     file directory: %s" % (self.dev_dir))
        print("     Test    file directory: %s" % (self.test_dir))
        print("     Raw     file directory: %s" % (self.raw_dir))
        print("     Middle  file directory: %s" % (self.middle_dir))
        print("     viterbi inputs model name: %s" % (self.viterbi_inputs_model_name))
        if self.typeinfo_dir:
            print("     typeinfo     directory: %s" % (self.typeinfo_dir))
        print("     Model   file directory: %s" % (self.model_dir))
        print("     Loadmodel    directory: %s" % (self.load_model_dir))
        print("     Decode  file directory: %s" % (self.decode_dir))
        print("     Train  instance number: %s" % (len(self.train_texts)))
        print("     Dev    instance number: %s" % (len(self.dev_texts)))
        print("     Test   instance number: %s" % (len(self.test_texts)))
        print("     Raw    instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s" %
                  (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s" %
                  (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char_seq_feature: %s" % (self.char_seq_feature))
            print("     Model char_hidden_dim: %s" % (self.char_hidden_dim))
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" % (self.trans_hidden_dim))
        if self.use_mapping:
            print("     Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print("     show_loss_per_batch: %s" % (self.show_loss_per_batch))
        print("     save_model: %s" % (self.save_model))
        print("     state_training_name: %s" % (self.state_training_name))
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.iteration))
        print("     BatchSize: %s" % (self.batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))
        print(" " + "++" * 20)
        print(" Hyperparameters:")
        print("     Hyper       seed_num: %s" % (self.seed_num))
        print("     Hyper             lr: %s" % (self.lr))
        print("     Hyper       lr_decay: %s" % (self.lr_decay))
        print("     Hyper           clip: %s" % (self.clip))
        print("     Hyper       momentum: %s" % (self.momentum))
        print("     Hyper             l2: %s" % (self.l2))
        print("     Hyper     hidden_dim: %s" % (self.hidden_dim))
        print("     Hyper        dropout: %s" % (self.dropout))
        print("     Hyper     lstm_layer: %s" % (self.lstm_layer))
        print("     Hyper         bilstm: %s" % (self.bilstm))
        print("     Hyper            GPU: %s" % (self.gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)
        print("     substring dir : %s" % (self.substring_dir))
        print("     bpe_emb_dir dir : %s" % (self.bpe_emb_dir))
        print("     pos_emb_dir dir : %s" % (self.pos_emb_dir))
        print("++" * 50)
        print("     circul time : %s" % (self.circul_time))
        print("     circul deepth : %s" % (self.circul_deepth))
        print("     gather output mode : %s" % (self.circul_gather_output_mode))
        print("++" * 50)
        print("     decode prepare mode : %s" % (self.decode_prepare_mode))
        print("++" * 50)
        sys.stdout.flush()

    def make_substring_label_alphabet(self):
        for label in self.label_alphabet.instances:
            label = label.split('-')[-1]
            self.substring_label_alphabet.add(label)
        self.substring_label_alphabet.close()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = 'feature_' + str(idx)
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_names.append(feature_prefix)
                print("Find feature: %s" % feature_prefix)
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                self.feature_emb_dims[idx] = self.feat_config[self.feature_names[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[self.feature_names[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[self.feature_names[idx]]['emb_norm']
        # exit(0)

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('windows-1252')
                # word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_alphabet_substring(self, input_file_dir, substring_file_prefix):
        ## labels are not read here
        input_files = os.listdir(input_file_dir)
        print(input_files)
        for input_file in input_files:
            plus_feature = ''
            input_file_name = os.path.split(input_file)[1]
            if input_file_name.split('.')[0] != substring_file_prefix:
                continue
            if 'bpe' in input_file_name:
                plus_feature = 'bpe'
            elif 'word' in input_file_name:
                plus_feature = 'word'
            if plus_feature == '':
                continue
            # os.path.join instead of the original bare concatenation, which
            # silently broke when input_file_dir lacked a trailing slash
            in_lines = open(os.path.join(input_file_dir, input_file), 'r').readlines()
            for line in in_lines:
                if len(line.strip()) > 0:
                    pairs = line.strip().split('\t')
                    words = pairs[0].decode('windows-1252')
                    # words = pairs[0].decode('utf-8')
                    if self.number_normalized:
                        words = normalize_word(words)
                    labels = pairs[-1]
                    for word in words.split():
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)
            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                    for line in type_info_lines:
                        line = line.rstrip().split()
                        for i, _ in enumerate(line):
                            line[i] = float(line[i])
                        type_info_matrix.append(np.array(line))
                print("Calculate type info distribution, and concatenate word and type......")
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print("type info length:{}".format(len(type_info_matrix)))
                self.word_emb_dim += len(type_info_matrix)
                print("new word dim is :{}".format(self.word_emb_dim))
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" %
                  (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet,
                self.trans_emb_dim, self.norm_trans_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                # fixed: the original read self.feature_name (a config string)
                # instead of the self.feature_names list
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s" %
                      (self.feature_names[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = \
                    build_pretrain_embedding(self.feature_emb_dirs[idx],
                                             self.feature_alphabets[idx],
                                             self.feature_emb_dims[idx],
                                             self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def generate_instance_substring(self, substring_file_prefix):
        self.init_substring_instance()
        self.make_substring_label_alphabet()
        input_files = os.listdir(self.substring_dir)
        print(input_files)
        for input_file in input_files:
            input_file_name = os.path.split(input_file)[1]
            input_file_dir = os.path.join(self.substring_dir, input_file_name)
            input_file_name_split = input_file_name.split('.')
            if input_file_name_split[0] != substring_file_prefix:
                continue
            print('dealing with %s' % input_file_name)
            name = input_file_name_split[1]
            feature_name = input_file_name_split[2]
            f_l = int(input_file_name_split[-1][3:])  # feature length
            if feature_name == 'word':
                alphabet = self.word_alphabet
            elif feature_name == 'char':
                alphabet = self.char_alphabet
            elif feature_name == 'pos':
                alphabet = self.feature_alphabets[0]
            elif feature_name == 'bpe':
                alphabet = self.feature_alphabets[1]
            s_f_id = self.substring_names.index(feature_name)  # substring feature id
            if name == "train":
                self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l] = \
                    read_instance_substring(input_file_dir, alphabet,
                                            self.substring_label_alphabet,
                                            self.number_normalized)
            elif name == "testa":
                self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] = \
                    read_instance_substring(input_file_dir, alphabet,
                                            self.substring_label_alphabet,
                                            self.number_normalized)
            elif name == "testb":
                self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] = \
                    read_instance_substring(input_file_dir, alphabet,
                                            self.substring_label_alphabet,
                                            self.number_normalized)
            else:
                print("Error: you can only generate train/testa/testb instance! Illegal input:%s" % name)

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results: [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        assert sent_num == len(pred_scores)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")
            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        # The original repeated the same three-line "the_item = ...; if
        # the_item in config: ..." pattern for every option; the tables below
        # keep the exact key -> attribute -> conversion behaviour compactly.
        str_items = [
            ('task_name', 'task_name'), ('data_bin_dir', 'data_bin_dir'),
            ('train_dir', 'train_dir'), ('dev_dir', 'dev_dir'),
            ('test_dir', 'test_dir'), ('trans_dir', 'trans_dir'),
            ('middle_dir', 'middle_dir'),
            ('viterbi_inputs_model_name', 'viterbi_inputs_model_name'),
            ('substring_dir', 'substring_dir'),
            ('bpe_emb_dir', 'bpe_emb_dir'), ('pos_emb_dir', 'pos_emb_dir'),
            ('raw_dir', 'raw_dir'), ('decode_dir', 'decode_dir'),
            ('model_dir', 'model_dir'), ('load_model_dir', 'load_model_dir'),
            ('word_emb_dir', 'word_emb_dir'), ('char_emb_dir', 'char_emb_dir'),
            ('trans_embed_dir', 'trans_embed_dir'),
            ('typeinfo_dir', 'typeinfo_dir'),
            ('mapping_func', 'mapping_func'),
            ('word_seq_feature', 'word_feature_extractor'),
            ('char_seq_feature', 'char_seq_feature'),
            ('feature', 'feat_config'),  # feat_config is a dict
            ('state_training_name', 'state_training_name'),
            ('optimizer', 'optimizer'), ('status', 'status'),
            ('feature_name', 'feature_name'), ('feature_ans', 'feature_ans'),
            ('circul_time', 'circul_time'),
            ('circul_deepth', 'circul_deepth'),
            ('circul_gather_output_mode', 'circul_gather_output_mode'),
            ('decode_prepare_mode', 'decode_prepare_mode'),
        ]
        int_items = [
            ('MAX_SENTENCE_LENGTH', 'MAX_SENTENCE_LENGTH'),
            ('MAX_WORD_LENGTH', 'MAX_WORD_LENGTH'),
            ('word_emb_dim', 'word_emb_dim'),
            ('char_emb_dim', 'char_emb_dim'),
            ('trans_emb_dim', 'trans_emb_dim'), ('nbest', 'nbest'),
            ('show_loss_per_batch', 'show_loss_per_batch'),
            ('cnn_layer', 'cnn_layer'), ('iteration', 'iteration'),
            ('batch_size', 'batch_size'),
            ('char_hidden_dim', 'char_hidden_dim'),
            ('trans_hidden_dim', 'trans_hidden_dim'),
            ('hidden_dim', 'hidden_dim'), ('lstm_layer', 'lstm_layer'),
            ('feature_length', 'feature_length'), ('class_num', 'class_num'),
        ]
        float_items = [
            ('dropout', 'dropout'), ('learning_rate', 'lr'),
            ('lr_decay', 'lr_decay'), ('momentum', 'momentum'), ('l2', 'l2'),
        ]
        bool_items = [
            ('norm_word_emb', 'norm_word_emb'),
            ('norm_char_emb', 'norm_char_emb'),
            ('number_normalized', 'number_normalized'), ('seg', 'seg'),
            ('use_crf', 'use_crf'), ('use_char', 'use_char'),
            ('use_trans', 'use_trans'), ('use_mapping', 'use_mapping'),
            ('save_model', 'save_model'),
            ('ave_batch_loss', 'average_batch_loss'),
            ('bilstm', 'bilstm'), ('gpu', 'gpu'),
        ]
        for key, attr in str_items:
            if key in config:
                setattr(self, attr, config[key])
        for key, attr in int_items:
            if key in config:
                setattr(self, attr, int(config[key]))
        for key, attr in float_items:
            if key in config:
                setattr(self, attr, float(config[key]))
        for key, attr in bool_items:
            if key in config:
                setattr(self, attr, str2bool(config[key]))
        # special cases
        if 'seed_num' in config and config['seed_num'] != 'None':
            self.seed_num = int(config['seed_num'])
        if 'clip' in config:
            self.clip = None if config['clip'] == 'None' else float(config['clip'])

    def read_arg(self, args):
        # Command-line values override config values when given. Most arg
        # names match the attribute names exactly:
        arg_items = [
            'task_name', 'data_bin_dir', 'train_dir', 'dev_dir', 'test_dir',
            'trans_dir', 'word_emb_dir', 'trans_embed_dir', 'middle_dir',
            'viterbi_inputs_model_name', 'substring_dir', 'bpe_emb_dir',
            'pos_emb_dir', 'model_dir', 'norm_word_emb', 'norm_char_emb',
            'word_emb_dim', 'char_emb_dim', 'trans_emb_dim',
            'number_normalized', 'seg', 'use_crf', 'use_char', 'use_trans',
            'char_seq_feature', 'nbest', 'status', 'state_training_name',
            'save_model', 'optimizer', 'iteration', 'batch_size',
            'show_loss_per_batch', 'seed_num', 'cnn_layer', 'char_hidden_dim',
            'trans_hidden_dim', 'hidden_dim', 'dropout', 'lstm_layer',
            'bilstm', 'lr_decay', 'momentum', 'l2', 'gpu', 'clip',
            'feature_name', 'feature_length', 'class_num', 'feature_ans',
            'circul_time', 'circul_deepth', 'circul_gather_output_mode',
            'decode_prepare_mode',
        ]
        for name in arg_items:
            value = getattr(args, name)
            if value is not None:
                setattr(self, name, value)
        # Args mapping onto differently named attributes. The original wrote
        # these to unused attributes (word_seq_feature, ave_batch_loss,
        # learning_rate); fixed here to match read_config.
        if args.word_seq_feature is not None:
            self.word_feature_extractor = args.word_seq_feature
        if args.ave_batch_loss is not None:
            self.average_batch_loss = args.ave_batch_loss
        if args.learning_rate is not None:
            self.lr = args.learning_rate

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":", 1)) == 2:
                    temp = line.strip().split(":", 1)
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(self.translation_alphabet.get_index(translation.strip()))
                    if ids == []:
                        ids = [0]
                    translation_id_format_temp[word_id] = ids
        for word in self.word_alphabet.instances:
            word_id = self.word_alphabet.get_index(word)
            if word_id in translation_id_format_temp.keys():
                self.translation_id_format[word_id] = translation_id_format_temp[word_id]
            else:
                self.translation_id_format[word_id] = [0]
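# Illustrative driver for the Data class above (Python 2, matching the
# class); the config file name is a placeholder. read_config fills the
# directories that build_alphabet and generate_instance then consume.
data = Data()
data.read_config('demo.train.config')
data.initial_feature_alphabets()
for path in (data.train_dir, data.dev_dir, data.test_dir):
    data.build_alphabet(path)
data.fix_alphabet()
data.build_pretrain_emb()
for name in ('train', 'dev', 'test'):
    data.generate_instance(name)
data.show_data_summary()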
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse,
          opt, fold_idx, isMeddra_dict):
    logging.info("train the vsm-based normalization model ...")
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'),
                                  v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)
    logging.info("alphabet size {}".format(word_alphabet.size()))

    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    logging.info("init_vector_for_dict")
    poses, poses_lengths = init_vector_for_dict(word_alphabet, dict_alphabet,
                                                dictionary, isMeddra_dict)

    vsm_model = VsmNormer(word_alphabet, word_embedding, embedding_dim,
                          dict_alphabet, poses, poses_lengths)

    logging.info("generate instances for training ...")
    train_X = []
    train_Y = []
    for doc in train_data:
        if isMeddra_dict:
            temp_X, temp_Y = generate_instances(doc.entities, word_alphabet,
                                                dict_alphabet)
        else:
            temp_X, temp_Y = generate_instances_ehr(doc.entities,
                                                    word_alphabet,
                                                    dict_alphabet,
                                                    dictionary_reverse)
        train_X.extend(temp_X)
        train_Y.extend(temp_Y)

    train_loader = DataLoader(MyDataset(train_X, train_Y), opt.batch_size,
                              shuffle=True, collate_fn=my_collate)
    optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr,
                           weight_decay=opt.l2)

    if not opt.tune_wordemb:
        freeze_net(vsm_model.word_embedding)

    if d.config['norm_vsm_pretrain'] == '1':
        dict_pretrain(dictionary, dictionary_reverse, d, isMeddra_dict,
                      optimizer, vsm_model)

    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10
    bad_counter = 0

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()
        vsm_model.train()
        train_iter = iter(train_loader)
        num_iter = len(train_loader)
        sum_loss = 0
        correct, total = 0, 0
        for i in range(num_iter):
            x, lengths, y = next(train_iter)
            l, y_pred = vsm_model.forward_train(x, lengths, y)
            sum_loss += l.item()
            l.backward()
            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(vsm_model.parameters(),
                                               opt.gradient_clip)
            optimizer.step()
            vsm_model.zero_grad()
            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" %
            (idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                          dictionary_reverse, vsm_model, None,
                                          None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % best_dev_f)
            if fold_idx is None:
                torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            else:
                torch.save(vsm_model,
                           os.path.join(opt.output,
                                        "vsm_{}.pkl".format(fold_idx + 1)))
            best_dev_f = f
            best_dev_p = p
            best_dev_r = r
            bad_counter = 0
        else:
            bad_counter += 1

        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    if len(opt.dev_file) == 0:
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=0, label_column=1, label_name='senti', oov='embedding', fine_tune=False, embedding="word2Vec", embedding_path=None, use_character=False): """ load data from file :param train_path: path of training file :param dev_path: path of dev file :param test_path: path of test file :param word_column: the column index of word (start from 0) :param label_column: the column of label (start from 0) :param label_name: name of label, such as pos or ner :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words in dev and test data to alphabet; if "random", not. :param fine_tune: if fine tune word embeddings. :param embedding: embeddings for words, choose from ['word2vec', 'senna']. :param embedding_path: path of file storing word embeddings. :param use_character: if use character embeddings. :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table """ def construct_tensor_fine_tune(word_index_sentences, label_index_sentences): X = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Y = [] mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX) for i in range(len(word_index_sentences)): word_ids = word_index_sentences[i] label_ids = label_index_sentences[i] length = len(word_ids) for j in range(length): wid = word_ids[j] X[i, j] = wid label = label_ids[0] Y.append(label) # Zero out X after the end of the sequence X[i, length:] = 0 # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 return X, Y, mask def generate_dataset_fine_tune(): """ generate data tensor when fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size """ embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict( embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length]) X_train, Y_train, mask_train = construct_tensor_fine_tune( word_index_sentences_train, label_index_sentences_train) X_dev, Y_dev, mask_dev = construct_tensor_fine_tune( word_index_sentences_dev, label_index_sentences_dev) X_test, Y_test, mask_test = construct_tensor_fine_tune( word_index_sentences_test, label_index_sentences_test) C_train, C_dev, C_test, char_embedd_table = generate_character_data( word_sentences_train, word_sentences_dev, word_sentences_test, max_length) if use_character else (None, None, None, None) return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \ C_train, C_dev, C_test, char_embedd_table def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict, embedd_dim, caseless): X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX) Y = np.empty([len(word_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX) # bad_dict = dict() # bad_num = 0 for i in range(len(word_sentences)): words = word_sentences[i] label_ids = label_index_sentences[i] length = len(words) for j in range(length): word = words[j].lower() if caseless else words[j] label = label_ids[j] 
embedd = embedd_dict[ word] if word in embedd_dict else unknown_embedd X[i, j, :] = embedd Y[i, j] = label - 1 # if word not in embedd_dict: # bad_num += 1 # if word in bad_dict: # bad_dict[word] += 1 # else: # bad_dict[word] = 1 # Zero out X after the end of the sequence X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX) # Copy the last label after the end of the sequence Y[i, length:] = Y[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 # for w, c in bad_dict.items(): # if c >= 100: # print "%s: %d" % (w, c) # print bad_num return X, Y, mask def generate_dataset_not_fine_tune(): """ generate data tensor when not fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size """ embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict( embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length]) unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim]) X_train, Y_train, mask_train = construct_tensor_not_fine_tune( word_sentences_train, label_index_sentences_train, unknown_embedd, embedd_dict, embedd_dim, caseless) X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune( word_sentences_dev, label_index_sentences_dev, unknown_embedd, embedd_dict, embedd_dim, caseless) X_test, Y_test, mask_test = construct_tensor_not_fine_tune( word_sentences_test, label_index_sentences_test, unknown_embedd, embedd_dict, embedd_dim, caseless) C_train, C_dev, C_test, char_embedd_table = generate_character_data( word_sentences_train, word_sentences_dev, word_sentences_test, max_length) if use_character else (None, None, None, None) return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ None, label_alphabet, C_train, C_dev, C_test, char_embedd_table word_alphabet = Alphabet('word') label_alphabet = Alphabet(label_name) # read training data logger.info("Reading data from training set...") word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling( train_path, word_alphabet, label_alphabet, word_column, label_column) # if oov is "random" and do not fine tune, close word_alphabet if oov == "random" and not fine_tune: logger.info("Close word alphabet.") word_alphabet.close() # read dev data logger.info("Reading data from dev set...") word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling( dev_path, word_alphabet, label_alphabet, word_column, label_column) # read test data logger.info("Reading data from test set...") word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling( test_path, word_alphabet, label_alphabet, word_column, label_column) # close alphabets word_alphabet.close() label_alphabet.close() logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) # get maximum length max_length_train = get_max_length(word_sentences_train) max_length_dev = get_max_length(word_sentences_dev) max_length_test = get_max_length(word_sentences_test) max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test)) logger.info("Maximum length of training set is %d" % max_length_train) logger.info("Maximum length of dev set is %d" % 
max_length_dev) logger.info("Maximum length of test set is %d" % max_length_test) logger.info("Maximum length used for training is %d" % max_length) if fine_tune: logger.info("Generating data with fine tuning...") return generate_dataset_fine_tune() else: logger.info("Generating data without fine tuning...") return generate_dataset_not_fine_tune()
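# --- Illustrative sketch (not from the original sources) ---
# A minimal, self-contained version of the padding/mask construction used by
# construct_tensor_fine_tune above, with plain lists of word ids standing in
# for the Alphabet-indexed sentences.
import numpy as np

def pad_with_mask(word_index_sentences, max_length):
    # X holds word ids, zero-padded past each sentence's true length;
    # mask is 1.0 inside the sentence and 0.0 over the padding.
    X = np.zeros([len(word_index_sentences), max_length], dtype=np.int32)
    mask = np.zeros([len(word_index_sentences), max_length], dtype=np.float32)
    for i, word_ids in enumerate(word_index_sentences):
        length = min(len(word_ids), max_length)
        X[i, :length] = word_ids[:length]
        mask[i, :length] = 1.0
    return X, mask

X, mask = pad_with_mask([[3, 7, 2], [5]], max_length=4)
# X    -> [[3 7 2 0], [5 0 0 0]]
# mask -> [[1. 1. 1. 0.], [1. 0. 0. 0.]]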
def create_alphabets(alphabet_directory, data_paths, max_vocabulary_size, normalize_digits=True): logger = utils.get_logger("Create Alphabets") word_alphabet = Alphabet('word') pos_alphabet = Alphabet('pos') type_alphabet = Alphabet('type') if not gfile.Exists(alphabet_directory): logger.info("Creating Alphabets: %s" % alphabet_directory) pos_alphabet.add(ROOT_POS) type_alphabet.add(ROOT_TYPE) pos_alphabet.add(PAD_POS) type_alphabet.add(PAD_TYPE) vocab = dict() for data_path in data_paths: logger.info("Processing data: %s" % data_path) with gfile.GFile(data_path, mode="r") as file: for line in file: line = line.decode('utf-8') line = line.strip() if len(line) == 0: continue tokens = line.split() word = DIGIT_RE.sub( b"0", tokens[1]) if normalize_digits else tokens[1] pos = tokens[4] type = tokens[7] pos_alphabet.add(pos) type_alphabet.add(type) if word in vocab: vocab[word] += 1 else: vocab[word] = 1 vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) logger.info("Total Vocabulary Size: %d" % len(vocab_list)) logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) logger.info("Type Alphabet Size: %d" % type_alphabet.size()) if len(vocab_list) > max_vocabulary_size: vocab_list = vocab_list[:max_vocabulary_size] for word in vocab_list: word_alphabet.add(word) word_alphabet.save(alphabet_directory) pos_alphabet.save(alphabet_directory) type_alphabet.save(alphabet_directory) else: word_alphabet.load(alphabet_directory) pos_alphabet.load(alphabet_directory) type_alphabet.load(alphabet_directory) word_alphabet.close() pos_alphabet.close() type_alphabet.close() return word_alphabet, pos_alphabet, type_alphabet
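# --- Illustrative sketch (not from the original sources) ---
# The frequency cut-off used in create_alphabets above, reduced to plain
# dicts; the contents of _START_VOCAB here are an assumption standing in for
# the special symbols the real code prepends.
_START_VOCAB = ['_PAD', '_ROOT']

def build_vocab_list(tokens, max_vocabulary_size):
    vocab = {}
    for word in tokens:
        vocab[word] = vocab.get(word, 0) + 1
    # most frequent words first, after the reserved symbols
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    return vocab_list[:max_vocabulary_size]

print(build_vocab_list(['a', 'b', 'a', 'c', 'a', 'b'], max_vocabulary_size=4))
# -> ['_PAD', '_ROOT', 'a', 'b']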
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4, label_name='pos', oov='embedding', fine_tune=False, embedding="word2Vec", embedding_path=None, use_character=False): """ load data from file :param train_path: path of training file :param dev_path: path of dev file :param test_path: path of test file :param word_column: the column index of word (start from 0) :param label_column: the column of label (start from 0) :param label_name: name of label, such as pos or ner :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words in dev and test data to alphabet; if "random", not. :param fine_tune: if fine tune word embeddings. :param embedding: embeddings for words, choose from ['word2vec', 'senna']. :param embedding_path: path of file storing word embeddings. :param use_character: if use character embeddings. :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table """ def get_max_length(word_sentences): max_len = 0 for sentence in word_sentences: length = len(sentence) if length > max_len: max_len = length return max_len def construct_tensor_fine_tune(word_index_sentences, label_index_sentences): X = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX) for i in range(len(word_index_sentences)): word_ids = word_index_sentences[i] label_ids = label_index_sentences[i] length = len(word_ids) for j in range(length): wid = word_ids[j] label = label_ids[j] X[i, j] = wid Y[i, j] = label - 1 # Zero out X after the end of the sequence X[i, length:] = 0 # Copy the last label after the end of the sequence Y[i, length:] = Y[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 return X, Y, mask def build_embedd_table(embedd_dict, embedd_dim, caseless): scale = np.sqrt(3.0 / embedd_dim) embedd_table = np.empty([word_alphabet.size(), embedd_dim], dtype=theano.config.floatX) embedd_table[word_alphabet.default_index, :] = np.random.uniform(-scale, scale, [1, embedd_dim]) for word, index in word_alphabet.iteritems(): ww = word.lower() if caseless else word embedd = embedd_dict[ww] if ww in embedd_dict else np.random.uniform(-scale, scale, [1, embedd_dim]) embedd_table[index, :] = embedd return embedd_table def generate_dataset_fine_tune(): """ generate data tensor when fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size """ embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length]) X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train, label_index_sentences_train) X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev) X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test, label_index_sentences_test) C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length) if use_character else ( None, None, None, None) return 
X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ build_embedd_table(embedd_dict, embedd_dim, caseless), label_alphabet, \ C_train, C_dev, C_test, char_embedd_table def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict, embedd_dim, caseless): X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX) Y = np.empty([len(word_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX) # bad_dict = dict() # bad_num = 0 for i in range(len(word_sentences)): words = word_sentences[i] label_ids = label_index_sentences[i] length = len(words) for j in range(length): word = words[j].lower() if caseless else words[j] label = label_ids[j] embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd X[i, j, :] = embedd Y[i, j] = label - 1 # if word not in embedd_dict: # bad_num += 1 # if word in bad_dict: # bad_dict[word] += 1 # else: # bad_dict[word] = 1 # Zero out X after the end of the sequence X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX) # Copy the last label after the end of the sequence Y[i, length:] = Y[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 # for w, c in bad_dict.items(): # if c >= 100: # print "%s: %d" % (w, c) # print bad_num return X, Y, mask def generate_dataset_not_fine_tune(): """ generate data tensor when not fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size """ embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length]) unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim]) X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train, label_index_sentences_train, unknown_embedd, embedd_dict, embedd_dim, caseless) X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev, unknown_embedd, embedd_dict, embedd_dim, caseless) X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test, unknown_embedd, embedd_dict, embedd_dim, caseless) C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length) if use_character else ( None, None, None, None) return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ None, label_alphabet, C_train, C_dev, C_test, char_embedd_table word_alphabet = Alphabet('word') label_alphabet = Alphabet(label_name) # read training data logger.info("Reading data from training set...") word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling( train_path, word_alphabet, label_alphabet, word_column, label_column) # if oov is "random" and do not fine tune, close word_alphabet if oov == "random" and not fine_tune: logger.info("Close word alphabet.") word_alphabet.close() # read dev data logger.info("Reading data from dev set...") word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling( dev_path, word_alphabet, label_alphabet, word_column, label_column) # read test data logger.info("Reading 
data from test set...") word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling( test_path, word_alphabet, label_alphabet, word_column, label_column) # close alphabets word_alphabet.close() label_alphabet.close() logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) # get maximum length max_length_train = get_max_length(word_sentences_train) max_length_dev = get_max_length(word_sentences_dev) max_length_test = get_max_length(word_sentences_test) max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test)) logger.info("Maximum length of training set is %d" % max_length_train) logger.info("Maximum length of dev set is %d" % max_length_dev) logger.info("Maximum length of test set is %d" % max_length_test) logger.info("Maximum length used for training is %d" % max_length) if fine_tune: logger.info("Generating data with fine tuning...") return generate_dataset_fine_tune() else: logger.info("Generating data without fine tuning...") return generate_dataset_not_fine_tune()
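# --- Illustrative sketch (not from the original sources) ---
# The embedding-table construction from build_embedd_table above, with a
# plain dict standing in for the Alphabet. OOV rows are drawn uniformly from
# [-scale, scale] with scale = sqrt(3 / dim), the initialisation used
# throughout these loaders.
import numpy as np

def build_table(word_to_index, embedd_dict, embedd_dim, caseless=True):
    scale = np.sqrt(3.0 / embedd_dim)
    table = np.empty([len(word_to_index), embedd_dim], dtype=np.float32)
    for word, index in word_to_index.items():
        key = word.lower() if caseless else word
        if key in embedd_dict:
            table[index] = embedd_dict[key]  # pretrained vector
        else:
            table[index] = np.random.uniform(-scale, scale, embedd_dim)  # OOV
    return table

table = build_table({'<unk>': 0, 'Cat': 1}, {'cat': np.ones(4)}, embedd_dim=4)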
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict): logging.info("train the ensemble normalization model ...") external_train_data = [] if d.config.get('norm_ext_corpus') is not None: for k, v in d.config['norm_ext_corpus'].items(): if k == 'tac': external_train_data.extend( load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True)) else: raise RuntimeError("not support external corpus") if len(external_train_data) != 0: train_data.extend(external_train_data) logging.info("build alphabet ...") word_alphabet = Alphabet('word') norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict) norm_utils.build_alphabet(word_alphabet, train_data) if opt.dev_file: norm_utils.build_alphabet(word_alphabet, dev_data) if opt.test_file: norm_utils.build_alphabet(word_alphabet, test_data) norm_utils.fix_alphabet(word_alphabet) if d.config.get('norm_emb') is not None: logging.info("load pretrained word embedding ...") pretrain_word_embedding, word_emb_dim = build_pretrain_embedding( d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False) word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0) word_embedding.weight.data.copy_( torch.from_numpy(pretrain_word_embedding)) embedding_dim = word_emb_dim else: logging.info("randomly initialize word embedding ...") word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0) word_embedding.weight.data.copy_( torch.from_numpy( random_embedding(word_alphabet.size(), d.word_emb_dim))) embedding_dim = d.word_emb_dim dict_alphabet = Alphabet('dict') norm_utils.init_dict_alphabet(dict_alphabet, dictionary) norm_utils.fix_alphabet(dict_alphabet) # rule logging.info("init rule-based normer") multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse, isMeddra_dict) if opt.ensemble == 'learn': logging.info("init ensemble normer") poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict) ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses) if pretrain_neural_model is not None: ensemble_model.neural_linear.weight.data.copy_( pretrain_neural_model.linear.weight.data) if pretrain_vsm_model is not None: ensemble_model.vsm_linear.weight.data.copy_( pretrain_vsm_model.linear.weight.data) ensemble_train_X = [] ensemble_train_Y = [] for doc in train_data: temp_X, temp_Y = generate_instances(doc, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict) ensemble_train_X.extend(temp_X) ensemble_train_Y.extend(temp_Y) ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X, ensemble_train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate) ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=opt.lr, weight_decay=opt.l2) if opt.tune_wordemb == False: freeze_net(ensemble_model.word_embedding) else: # vsm logging.info("init vsm-based normer") poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict) # alphabet can share between vsm and neural since they don't change # but word_embedding cannot vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet, poses) vsm_train_X = [] vsm_train_Y = [] for doc in train_data: if isMeddra_dict: temp_X, temp_Y = vsm.generate_instances( doc.entities, word_alphabet, dict_alphabet) else: temp_X, temp_Y = vsm.generate_instances_ehr( doc.entities, word_alphabet, dict_alphabet, dictionary_reverse) 
vsm_train_X.extend(temp_X) vsm_train_Y.extend(temp_Y) vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y), opt.batch_size, shuffle=True, collate_fn=vsm.my_collate) vsm_optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2) if opt.tune_wordemb == False: freeze_net(vsm_model.word_embedding) if d.config['norm_vsm_pretrain'] == '1': vsm.dict_pretrain(dictionary, dictionary_reverse, d, True, vsm_optimizer, vsm_model) # neural logging.info("init neural-based normer") neural_model = norm_neural.NeuralNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet) neural_train_X = [] neural_train_Y = [] for doc in train_data: if isMeddra_dict: temp_X, temp_Y = norm_neural.generate_instances( doc.entities, word_alphabet, dict_alphabet) else: temp_X, temp_Y = norm_neural.generate_instances_ehr( doc.entities, word_alphabet, dict_alphabet, dictionary_reverse) neural_train_X.extend(temp_X) neural_train_Y.extend(temp_Y) neural_train_loader = DataLoader(norm_neural.MyDataset( neural_train_X, neural_train_Y), opt.batch_size, shuffle=True, collate_fn=norm_neural.my_collate) neural_optimizer = optim.Adam(neural_model.parameters(), lr=opt.lr, weight_decay=opt.l2) if opt.tune_wordemb == False: freeze_net(neural_model.word_embedding) if d.config['norm_neural_pretrain'] == '1': neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True, neural_optimizer, neural_model) best_dev_f = -10 best_dev_p = -10 best_dev_r = -10 bad_counter = 0 logging.info("start training ...") for idx in range(opt.iter): epoch_start = time.time() if opt.ensemble == 'learn': ensemble_model.train() ensemble_train_iter = iter(ensemble_train_loader) ensemble_num_iter = len(ensemble_train_loader) for i in range(ensemble_num_iter): x, rules, lengths, y = next(ensemble_train_iter) y_pred = ensemble_model.forward(x, rules, lengths) l = ensemble_model.loss(y_pred, y) l.backward() if opt.gradient_clip > 0: torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), opt.gradient_clip) ensemble_optimizer.step() ensemble_model.zero_grad() else: vsm_model.train() vsm_train_iter = iter(vsm_train_loader) vsm_num_iter = len(vsm_train_loader) for i in range(vsm_num_iter): x, lengths, y = next(vsm_train_iter) l, _ = vsm_model.forward_train(x, lengths, y) l.backward() if opt.gradient_clip > 0: torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip) vsm_optimizer.step() vsm_model.zero_grad() neural_model.train() neural_train_iter = iter(neural_train_loader) neural_num_iter = len(neural_train_loader) for i in range(neural_num_iter): x, lengths, y = next(neural_train_iter) y_pred = neural_model.forward(x, lengths) l = neural_model.loss(y_pred, y) l.backward() if opt.gradient_clip > 0: torch.nn.utils.clip_grad_norm_(neural_model.parameters(), opt.gradient_clip) neural_optimizer.step() neural_model.zero_grad() epoch_finish = time.time() logging.info("epoch: %s training finished. 
Time: %.2fs" % (idx, epoch_finish - epoch_start)) if opt.dev_file: if opt.ensemble == 'learn': # logging.info("weight w1: %.4f, w2: %.4f, w3: %.4f" % (ensemble_model.w1.data.item(), ensemble_model.w2.data.item(), ensemble_model.w3.data.item())) p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, None, None, ensemble_model, d, isMeddra_dict) else: p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model, neural_model, None, d, isMeddra_dict) logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f)) else: f = best_dev_f if f > best_dev_f: logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f)) if opt.ensemble == 'learn': if fold_idx is None: torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl")) else: torch.save( ensemble_model, os.path.join(opt.output, "ensemble_{}.pkl".format(fold_idx + 1))) else: if fold_idx is None: torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl")) torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl")) else: torch.save( vsm_model, os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1))) torch.save( neural_model, os.path.join(opt.output, "norm_neural_{}.pkl".format(fold_idx + 1))) best_dev_f = f best_dev_p = p best_dev_r = r bad_counter = 0 else: bad_counter += 1 if len(opt.dev_file) != 0 and bad_counter >= opt.patience: logging.info('Early Stop!') break logging.info("train finished") if fold_idx is None: multi_sieve.finalize(True) else: if fold_idx == opt.cross_validation - 1: multi_sieve.finalize(True) else: multi_sieve.finalize(False) if len(opt.dev_file) == 0: if opt.ensemble == 'learn': torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl")) else: torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl")) torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl")) return best_dev_p, best_dev_r, best_dev_f
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4, head_column=6, type_column=7, embedding="word2Vec", embedding_path=None): """ load data from file :param train_path: path of training file :param dev_path: path of dev file :param test_path: path of test file :param word_column: the column index of word (start from 0) :param pos_column: the column index of pos (start from 0) :param head_column: the column index of head (start from 0) :param type_column: the column index of types (start from 0) :param embedding: embeddings for words, choose from ['word2vec', 'senna']. :param embedding_path: path of file storing word embeddings. :return: X_train, POS_train, Head_train, Type_train, mask_train, X_dev, POS_dev, Head_dev, Type_dev, mask_dev, X_test, POS_test, Head_test, Type_test, mask_test, embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table """ def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences, type_index_sentences): X = np.empty([len(word_index_sentences), max_length], dtype=np.int32) POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX) for i in range(len(word_index_sentences)): word_ids = word_index_sentences[i] pos_ids = pos_index_sentences[i] heads = head_sentences[i] type_ids = type_index_sentences[i] length = len(word_ids) for j in range(length): wid = word_ids[j] pid = pos_ids[j] head = heads[j] tid = type_ids[j] X[i, j] = wid POS[i, j] = pid - 1 Head[i, j] = head Type[i, j] = tid - 1 # Zero out X after the end of the sequence X[i, length:] = 0 # Copy the last label after the end of the sequence POS[i, length:] = POS[i, length - 1] Head[i, length:] = Head[i, length - 1] Type[i, length:] = Type[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 return X, POS, Head, Type, mask word_alphabet = Alphabet('word') pos_alphabet = Alphabet('pos') type_alphabet = Alphabet('type') # read training data logger.info("Reading data from training set...") word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \ word_index_sentences_train, pos_index_sentences_train, \ type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # read dev data logger.info("Reading data from dev set...") word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \ word_index_sentences_dev, pos_index_sentences_dev, \ type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # read test data logger.info("Reading data from test set...") word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \ word_index_sentences_test, pos_index_sentences_test, \ type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # close alphabets word_alphabet.close() pos_alphabet.close() type_alphabet.close() logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1)) logger.info("type alphabet size: %d" % 
(type_alphabet.size() - 1)) # get maximum length max_length_train = get_max_length(word_sentences_train) max_length_dev = get_max_length(word_sentences_dev) max_length_test = get_max_length(word_sentences_test) max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test)) logger.info("Maximum length of training set is %d" % max_length_train) logger.info("Maximum length of dev set is %d" % max_length_dev) logger.info("Maximum length of test set is %d" % max_length_test) logger.info("Maximum length used for training is %d" % max_length) embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length]) X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(word_index_sentences_train, pos_index_sentences_train, head_sentences_train, type_index_sentences_train) X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(word_index_sentences_dev, pos_index_sentences_dev, head_sentences_dev, type_index_sentences_dev) X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(word_index_sentences_test, pos_index_sentences_test, head_sentences_test, type_index_sentences_test) embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless) C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length) return X_train, POS_train, Head_train, Type_train, mask_train, \ X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \ X_test, POS_test, Head_test, Type_test, mask_test, \ embedd_table, word_alphabet, pos_alphabet, type_alphabet, \ C_train, C_dev, C_test, char_embedd_table
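# --- Illustrative sketch (not from the original sources) ---
# Why the loaders above store `pid - 1` / `tid - 1`: assuming the Alphabet
# reserves index 0 (so real labels start at 1, which also explains the
# `size() - 1` in the log messages), subtracting 1 yields the 0-based class
# ids a softmax output layer expects.
label_to_index = {'NN': 1, 'VB': 2, 'DT': 3}   # alphabet-style, 1-based
class_ids = [label_to_index[l] - 1 for l in ['DT', 'NN', 'VB']]
print(class_ids)  # -> [2, 0, 1]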
def loadDataForSequenceLabeling(train_path, dev_path, test_path, char_emb_dim, word_column=0, label_column=3, label_name='pos', oov='embedding', fine_tune=False, embeddingToUse="glove", embedding_path=None, use_character=True): """ load data from file :param train_path: path of training file :param dev_path: path of dev file :param test_path: path of test file :param word_column: the column index of word (start from 0) :param label_column: the column of label (start from 0) :param label_name: name of label, such as pos or ner :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words in dev and test data to alphabet; if "random", not. :param fine_tune: if fine tune word embeddings. :param embedding: embeddings for words, choose from ['word2vec', 'senna']. :param embedding_path: path of file storing word embeddings. :param use_character: if use character embeddings. :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table """ def construct_tensor_fine_tune(word_index_sentences, label_index_sentences): X = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX) for i in range(len(word_index_sentences)): word_ids = word_index_sentences[i] label_ids = label_index_sentences[i] length = len(word_ids) for j in range(length): wid = word_ids[j] label = label_ids[j] X[i, j] = wid Y[i, j] = label - 1 # Zero out X after the end of the sequence X[i, length:] = 0 # Copy the last label after the end of the sequence Y[i, length:] = Y[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 return X, Y, mask def construct_orth_tensor_fine_tune(orth_word_index_sentences): X = np.empty([len(orth_word_index_sentences), max_length], dtype=np.int32) for i in range(len(orth_word_index_sentences)): orth_word_ids = orth_word_index_sentences[i] length = len(orth_word_ids) for j in range(length): wid = orth_word_ids[j] X[i, j] = wid # Zero out X after the end of the sequence X[i, length:] = 0 return X def generateDatasetFineTune(): """ generate data tensor when fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size """ word_emb_dict, word_emb_dim, caseless = utils.loadEmbeddingsFromFile( embeddingToUse, embedding_path, word_alphabet, logger) # TODO add a cmd line arg for this orth_word_emb_dict, orth_word_emb_dim = \ utils.randomlyInitialiseOrthographicEmbeddings(orth_word_alphabet, logger, 200) logger.info("Dimension of embedding is %d, Caseless: %d" % (word_emb_dim, caseless)) # fill data tensor (X.shape = [#data, max_length], # Y.shape = [#data, max_length]) X_train, Y_train, mask_train = construct_tensor_fine_tune( word_index_sentences_train, label_index_sentences_train) X_train_orth = construct_orth_tensor_fine_tune( orth_word_index_sentences_train) X_dev, Y_dev, mask_dev = construct_tensor_fine_tune( word_index_sentences_dev, label_index_sentences_dev) X_dev_orth = construct_orth_tensor_fine_tune( orth_word_index_sentences_dev) X_test, Y_test, mask_test = construct_tensor_fine_tune( word_index_sentences_test, label_index_sentences_test) X_test_orth = construct_orth_tensor_fine_tune( orth_word_index_sentences_test) C_train, C_dev, C_test, char_emb_table = 
generate_character_data( word_sentences_train, word_sentences_dev, word_sentences_test, max_length, "char", 30) if use_character else \ (None, None, None, None) orth_C_train, orth_C_dev, orth_C_test, orth_char_emb_table = \ generate_character_data(orth_word_sentences_train, orth_word_sentences_dev, orth_word_sentences_test, max_length, "orth_char", 30) if use_character else \ (None, None, None, None) word_emb_table = build_embedd_table(word_alphabet, word_emb_dict, word_emb_dim, caseless) orth_word_emb_table = build_embedd_table(orth_word_alphabet, orth_word_emb_dict, orth_word_emb_dim, False) return X_train, Y_train, mask_train, X_train_orth, \ X_dev, Y_dev, mask_dev, X_dev_orth, \ X_test, Y_test, mask_test, X_test_orth, \ word_emb_table, word_alphabet, orth_word_emb_table, \ label_alphabet, \ C_train, C_dev, C_test, char_emb_table, \ orth_C_train, orth_C_dev, orth_C_test, orth_char_emb_table def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, word_emb_dict, word_emb_dim, caseless): X = np.empty([len(word_sentences), max_length, word_emb_dim], dtype=theano.config.floatX) Y = np.empty([len(word_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX) # bad_dict = dict() # bad_num = 0 for i in range(len(word_sentences)): words = word_sentences[i] label_ids = label_index_sentences[i] length = len(words) for j in range(length): word = words[j].lower() if caseless else words[j] label = label_ids[j] embedd = word_emb_dict[word] if word in word_emb_dict \ else unknown_embedd X[i, j, :] = embedd Y[i, j] = label - 1 # if word not in word_emb_dict: # bad_num += 1 # if word in bad_dict: # bad_dict[word] += 1 # else: # bad_dict[word] = 1 # Zero out X after the end of the sequence X[i, length:] = np.zeros([1, word_emb_dim], dtype=theano.config.floatX) # Copy the last label after the end of the sequence Y[i, length:] = Y[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 # for w, c in bad_dict.items(): # if c >= 100: # print "%s: %d" % (w, c) # print bad_num return X, Y, mask def generateDatasetWithoutFineTune(): """ generate data tensor when not fine tuning :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size """ word_emb_dict, word_emb_dim, caseless = \ utils.loadEmbeddingsFromFile(embeddingToUse, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %s" % (word_emb_dim, caseless)) # fill data tensor (X.shape = [#data, max_length, embedding_dim], # Y.shape = [#data, max_length]) unknown_embedd = np.random.uniform(-0.01, 0.01, [1, word_emb_dim]) X_train, Y_train, mask_train = construct_tensor_not_fine_tune( word_sentences_train, label_index_sentences_train, unknown_embedd, word_emb_dict, word_emb_dim, caseless) X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune( word_sentences_dev, label_index_sentences_dev, unknown_embedd, word_emb_dict, word_emb_dim, caseless) X_test, Y_test, mask_test = construct_tensor_not_fine_tune( word_sentences_test, label_index_sentences_test, unknown_embedd, word_emb_dict, word_emb_dim, caseless) C_train, C_dev, C_test, char_embedd_table = generate_character_data( word_sentences_train, word_sentences_dev, word_sentences_test, max_length) if use_character else ( None, None, None, None) return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, \ Y_test, mask_test, None, label_alphabet, C_train, C_dev, \ C_test, 
char_embedd_table word_alphabet = Alphabet('word') label_alphabet = Alphabet(label_name) orth_word_alphabet = Alphabet('word_orth') # read training data logger.info("Reading data from training set...") word_sentences_train, _, word_index_sentences_train, \ label_index_sentences_train = readDataForSequenceLabeling( train_path, word_alphabet, label_alphabet, word_column, label_column) orth_word_sentences_train, orth_word_index_sentences_train = \ readDataForSequenceLabelingOrthographic(train_path, orth_word_alphabet) # if oov is "random" and do not fine tune, close word_alphabet if oov == "random" and not fine_tune: logger.info("Close word alphabet.") word_alphabet.close() orth_word_alphabet.close() # TODO: What's this for? # read dev data logger.info("Reading data from dev set...") word_sentences_dev, _, word_index_sentences_dev, \ label_index_sentences_dev = readDataForSequenceLabeling( dev_path, word_alphabet, label_alphabet, word_column, label_column) orth_word_sentences_dev, orth_word_index_sentences_dev = \ readDataForSequenceLabelingOrthographic( dev_path, orth_word_alphabet) # read test data logger.info("Reading data from test set...") word_sentences_test, _, word_index_sentences_test, \ label_index_sentences_test = readDataForSequenceLabeling( test_path, word_alphabet, label_alphabet, word_column, label_column) orth_word_sentences_test, orth_word_index_sentences_test = \ readDataForSequenceLabelingOrthographic( test_path, orth_word_alphabet) # close alphabets word_alphabet.close() label_alphabet.close() orth_word_alphabet.close() logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) logger.info("orthographic word alphabet size: %d" % (orth_word_alphabet.size() - 1)) # get maximum length max_length_train = get_max_length(word_sentences_train) max_length_dev = get_max_length(word_sentences_dev) max_length_test = get_max_length(word_sentences_test) max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test)) logger.info("maximum length of training set: %d" % max_length_train) logger.info("maximum length of dev set: %d" % max_length_dev) logger.info("maximum length of test set: %d" % max_length_test) logger.info("maximum length used for training: %d" % max_length) if fine_tune: logger.info("generating data with fine tuning...") return generateDatasetFineTune() else: logger.info("generating data without fine tuning...") return generateDatasetWithoutFineTune()
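# --- Illustrative sketch (not from the original sources) ---
# One common way to build the "orthographic" word forms consumed above: map
# each character to a shape class. The exact mapping used by
# readDataForSequenceLabelingOrthographic may differ; this only shows the
# general idea.
def orthographic(word):
    shape = []
    for ch in word:
        if ch.isupper():
            shape.append('C')
        elif ch.islower():
            shape.append('c')
        elif ch.isdigit():
            shape.append('n')
        else:
            shape.append('p')   # punctuation / other
    return ''.join(shape)

print(orthographic('iPhone7!'))  # -> 'cCccccnp'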
class Data:
    def __init__(self, input_file):
        self.original_data = open(input_file, 'r').readlines()
        self.index_data = []
        self.word_alphabet = Alphabet('word')
        self.gloss_alphabet = Alphabet('gloss')
        self.entity_alphabet = Alphabet('entity')
        self.gaz_alphabet = Alphabet('gaz')
        self.label_alphabet = Alphabet('label')
        self.word_alphabet_size = 0
        self.gloss_alphabet_size = 0
        self.entity_alphabet_size = 0
        self.gaz_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_gaz_hidden_dim = 50
        self.HP_lstm_hidden_dim = 200
        self.HP_dropout = 0.5
        self.gaz_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = False
        self.HP_use_entity = False
        self.HP_use_gloss = True
        self.HP_use_gaz = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        # embedding hyperparameters
        self.word_emb_dim = 200
        self.entity_emb_dim = 50
        self.gloss_features = "CNN"  # ["CNN","LSTM"]
        self.gloss_emb_dim = 200
        self.gloss_hidden_dim = 300
        self.pretrain_word_embedding = np.array([])
        self.pretrain_gaz_embedding = None
        self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  # "NYM_200.txt"
        self.gaz_embed_path = None
        self.gaz_emb_dim = 200
        self.HP_fix_gaz_emb = True

    def build_alphabet(self):
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            for word in words:
                self.word_alphabet.add(word)
            sentence_gloss = line["babel_gloss"]
            for word_gloss in sentence_gloss:
                for phrase_gloss in word_gloss:  # one word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]
                        final_gloss = " . ".join(phrase_gloss_EN)
                        # NOTE: iterates over the characters of the joined gloss;
                        # the word-level variant is kept below for reference
                        for de_word in final_gloss:
                            # for definates in phrase_gloss_EN:
                            #     for de_word in definates.split():
                            self.gloss_alphabet.add(de_word)
            entitys = line["entity_context"]
            for entity in entitys:
                self.entity_alphabet.add(entity)
            gazs = line["babel_phase"]
            for gaz in gazs:
                for item in gaz:
                    self.gaz_alphabet.add(item)
            labels = line["detection_label"]
            for label in labels:
                self.label_alphabet.add(label)
        print(self.label_alphabet.get_content())
        self.word_alphabet_size = self.word_alphabet.size()
        self.gloss_alphabet_size = self.gloss_alphabet.size()
        self.entity_alphabet_size = self.entity_alphabet.size()
        self.gaz_alphabet_size = self.gaz_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.word_alphabet.close()
        self.gloss_alphabet.close()
        self.entity_alphabet.close()
        self.gaz_alphabet.close()
        self.label_alphabet.close()

    def generate_instance_Ids(self):  # convert input sentences into index (Id) sequences
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            words_Id = []
            for word in words:
                words_Id.append(self.word_alphabet.get_index(word))
            sentence_gloss = line["babel_gloss"]
            sentence_glosses_Id = []
            for word_gloss in sentence_gloss:
                word_glosses_Id = []
                for phrase_gloss in word_gloss:  # one word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]  # this is a list
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            word_glosses_Id.append(
                                self.gloss_alphabet.get_index(de_word))
                sentence_glosses_Id.append(word_glosses_Id)
            entitys = line["entity_context"]
            entitys_Id = []
            for entity in entitys:
                entitys_Id.append(self.entity_alphabet.get_index(entity))
            gazs = line["babel_phase"]
            # sentence_gazs_Id = [[[take over, take over of, ...], [2, 3, ...]],
            #                     [[legal, legal procedures, ...], [1, 2, ...]], ...,
            #                     [[open the window, open the window please, ...], [3, 4, ...]]]
            sentence_gazs_Id = []
            for gaz in gazs:
                Ids = []
                Lens = []
                for item in gaz:
                    Ids.append(self.gaz_alphabet.get_index(item))
                    Lens.append(len(item.split()))
                word_gazs_Id = [Ids, Lens]
                sentence_gazs_Id.append(word_gazs_Id)
            labels = line["detection_label"]
            labels_Id = []
            for label in labels:
                labels_Id.append(self.label_alphabet.get_index(label))
            self.index_data.append([
                words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id,
                labels_Id
            ])

    def load_pretrain_emb(self, embedding_path):
        lines = open(embedding_path, 'r', encoding="utf-8").readlines()
        statistic = lines[0].strip()  # header line holds two statistics: word count, vector dimension
        # print(statistic)
        embedd_dim = int(statistic.split()[1])
        embedd_dict = dict()
        embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)]  # the padding token maps to an all-zero vector
        # print(len(embedd_dict["<pad>"]))
        for line in lines[1:]:
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens))
            embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]]
        return embedd_dict, embedd_dim

    def norm2one(self, vec):
        if np.sum(vec) == 0:
            return vec
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self, embedding_path, word_alphabet, embedd_dim=200, norm=True):
        embedd_dict = dict()
        if embedding_path is not None:  # read the embedding dictionary
            embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path)
        scale = np.sqrt(3.0 / embedd_dim)
        # pretrain_emb is the embedding matrix reordered to match the alphabet
        pretrain_emb = np.zeros([word_alphabet.size(), embedd_dim])
        perfect_match = 0
        case_match = 0
        not_match = 0
        for word, index in word_alphabet.get_alphabet().items():
            if word in embedd_dict:
                # print(word, index)
                # print(len(embedd_dict[word]))
                if norm:
                    pretrain_emb[index] = self.norm2one(embedd_dict[word])
                else:
                    pretrain_emb[index] = embedd_dict[word]
                perfect_match += 1
            elif word.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index] = self.norm2one(
                        embedd_dict[word.lower()])
                else:
                    pretrain_emb[index] = embedd_dict[word.lower()]
                case_match += 1
            else:
                pretrain_emb[index] = np.random.uniform(
                    -scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        # print("pad's embedding:", pretrain_emb[word_alphabet.get_index(",")])
        print(
            "Embedding:\n pretrain word:%s, perfect match:%s, case_match:%s, oov:%s, oov%%:%s"
            % (pretrained_size, perfect_match, case_match, not_match,
               (not_match + 0.) / word_alphabet.size()))
        # returns the embedding matrix reordered by alphabet order and the vector dimension
        return pretrain_emb, embedd_dim

    def generate_embedding(self):
        self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.word_alphabet)
        self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gloss_alphabet)
        self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gaz_alphabet)
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True # self.punctuation_filter = True self.norm_word_emb = True self.norm_biword_emb = True self.norm_gaz_emb = False self.word_alphabet = Alphabet('word') self.biword_alphabet = Alphabet('biword') self.char_alphabet = Alphabet('character') # self.word_alphabet.add(START) # self.word_alphabet.add(UNKNOWN) # self.char_alphabet.add(START) # self.char_alphabet.add(UNKNOWN) # self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label', True) self.gaz_lower = False self.gaz = Gazetteer(self.gaz_lower) self.gaz_alphabet = Alphabet('gaz') self.HP_fix_gaz_emb = False self.HP_use_gaz = True self.tagScheme = "NoSeg" self.char_features = "LSTM" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.use_bigram = True self.word_emb_dim = 50 self.biword_emb_dim = 50 self.char_emb_dim = 30 self.gaz_emb_dim = 50 self.gaz_dropout = 0.5 self.pretrain_word_embedding = None self.pretrain_biword_embedding = None self.pretrain_gaz_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.biword_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = False self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = 5.0 self.HP_momentum = 0 def show_data_summary(self): addLogSectionMark("DATA SUMMARY") print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) # print(" Punctuation filter: %s" % (self.punctuation_filter)) print(" Use bigram: %s" % (self.use_bigram)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Biword alphabet size: %s" % (self.biword_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Biword embedding size: %s" % (self.biword_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Gaz embedding size: %s" % (self.gaz_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm biword emb: %s" % (self.norm_biword_emb)) print(" Norm gaz emb: %s" % (self.norm_gaz_emb)) print(" Norm gaz dropout: %s" % (self.gaz_dropout)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % 
(self.HP_bilstm)) print(" Hyperpara GPU: %s" % (self.HP_gpu)) print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) logger.info(" Tag scheme: %s" % (self.tagScheme)) logger.info(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) logger.info(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) logger.info(" Number normalized: %s" % (self.number_normalized)) logger.info(" Use bigram: %s" % (self.use_bigram)) logger.info(" Word alphabet size: %s" % (self.word_alphabet_size)) logger.info(" Biword alphabet size: %s" % (self.biword_alphabet_size)) logger.info(" Char alphabet size: %s" % (self.char_alphabet_size)) logger.info(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) logger.info(" Label alphabet size: %s" % (self.label_alphabet_size)) logger.info(" Word embedding size: %s" % (self.word_emb_dim)) logger.info(" Biword embedding size: %s" % (self.biword_emb_dim)) logger.info(" Char embedding size: %s" % (self.char_emb_dim)) logger.info(" Gaz embedding size: %s" % (self.gaz_emb_dim)) logger.info(" Norm word emb: %s" % (self.norm_word_emb)) logger.info(" Norm biword emb: %s" % (self.norm_biword_emb)) logger.info(" Norm gaz emb: %s" % (self.norm_gaz_emb)) logger.info(" Norm gaz dropout: %s" % (self.gaz_dropout)) logger.info(" Train instance number: %s" % (len(self.train_texts))) logger.info(" Dev instance number: %s" % (len(self.dev_texts))) logger.info(" Test instance number: %s" % (len(self.test_texts))) logger.info(" Raw instance number: %s" % (len(self.raw_texts))) logger.info(" Hyperpara iteration: %s" % (self.HP_iteration)) logger.info(" Hyperpara batch size: %s" % (self.HP_batch_size)) logger.info(" Hyperpara lr: %s" % (self.HP_lr)) logger.info(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) logger.info(" Hyperpara HP_clip: %s" % (self.HP_clip)) logger.info(" Hyperpara momentum: %s" % (self.HP_momentum)) logger.info(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) logger.info(" Hyperpara dropout: %s" % (self.HP_dropout)) logger.info(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) logger.info(" Hyperpara bilstm: %s" % (self.HP_bilstm)) logger.info(" Hyperpara GPU: %s" % (self.HP_gpu)) logger.info(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) logger.info(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) if self.HP_use_char: print(" Char_features: %s" % (self.char_features)) logger.info(" Char_features: %s" % (self.char_features)) print("DATA SUMMARY END.") sys.stdout.flush() def refresh_label_alphabet(self, input_file): old_size = self.label_alphabet_size self.label_alphabet.clear(True) in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() label = pairs[-1] self.label_alphabet.add(label) self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" self.fix_alphabet() print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size)) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in xrange(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = 
normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2: biword = word + in_lines[ idx + 1].strip().split()[0].decode('utf-8') else: biword = word + NULLKEY self.biword_alphabet.add(biword) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.biword_alphabet_size = self.biword_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def build_gaz_file(self, gaz_file): ## build gaz file,initial read gaz embedding file if gaz_file: fins = open(gaz_file, 'r').readlines() for fin in fins: fin = fin.strip().split()[0].decode('utf-8') if fin: self.gaz.insert(fin, "one_source") print "Load gaz file: ", gaz_file, " total size:", self.gaz.size() else: print "Gaz file is None, load nothing" def build_gaz_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() word_list = [] for line in in_lines: if len(line) > 3: word = line.split()[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) word_list.append(word) else: w_length = len(word_list) for idx in range(w_length): matched_entity = self.gaz.enumerateMatchList( word_list[idx:]) for entity in matched_entity: # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity) self.gaz_alphabet.add(entity) word_list = [] print "gaz alphabet size:", self.gaz_alphabet.size() def fix_alphabet(self): self.word_alphabet.close() self.biword_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.gaz_alphabet.close() def build_word_pretrain_emb(self, emb_path): print "build word pretrain emb..." self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def build_biword_pretrain_emb(self, emb_path): print "build biword pretrain emb..." self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding( emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb) def build_gaz_pretrain_emb(self, emb_path): print "build gaz pretrain emb..." 
self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding( emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb) def generate_instance(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def generate_instance_with_gaz(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "sentence": self.raw_texts, self.raw_Ids = read_instance_with_gaz_text( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def write_decoded_results(self, output_file, predict_results, name): fout = open(output_file, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. 
%s" % (name, output_file)) def write_decoded_results_back(self, predict_results, name): sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) result = [] for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] print(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') for idx in range(sent_num): sent_length = len(predict_results[idx]) data = {'start': '', 'end': "", 'value': '', 'entity': ''} value = '' for idy in range(sent_length): pre_su_item = predict_results[idx][idy].split('-') if pre_su_item[0] == 'S': data['start'] = str(idy) data['end'] = str(idy + 1) data['value'] = content_list[idx][0][idy].encode('utf-8') data['entity'] = pre_su_item[1] result.append(data) data = {'start': '', 'end': "", 'value': '', 'entity': ''} if pre_su_item[0] == 'B': data['start'] = str(idy) value = value + (content_list[idx][0][idy].encode('utf-8')) if pre_su_item[0] == 'E': value = value + (content_list[idx][0][idy].encode('utf-8')) data['end'] = str(idy + 1) data['value'] = value data['entity'] = pre_su_item[1] result.append(data) data = {'start': '', 'end': "", 'value': '', 'entity': ''} value = '' if pre_su_item[0] == 'I': value = value + (content_list[idx][0][idy].encode('utf-8')) return result def write_http_data(self, output_file, inputData, name): fout = open(output_file, 'w') get_num = len(inputData) start = 0 numOfParagram = int(math.ceil(get_num / 5.0)) num_start_sentence = start num_end_sentence = numOfParagram if name == "test": num_start_sentence = 0 num_end_sentence = numOfParagram elif name == "dev": num_start_sentence = numOfParagram num_end_sentence = numOfParagram * 2 elif name == "train": num_start_sentence = numOfParagram * 2 num_end_sentence = get_num for idx in range(num_start_sentence, num_end_sentence): text = inputData[idx]["text"] entities = inputData[idx]["entities"] idText = 1 inWord = False tagReady = False entity_name = '' for Text in text: ## content_list[idx] is a list with [word, char, label] tagReady = False for entity in entities: if not inWord: if entity['start'] + 1 == entity['end'] and entity[ 'end'] == idText: fout.write( Text.encode('utf-8') + " " + "S-" + entity['entity'].encode('utf-8') + '\n') tagReady = True break if entity['start'] + 1 == idText: fout.write( Text.encode('utf-8') + " " + "B-" + entity['entity'].encode('utf-8') + '\n') tagReady = True inWord = True entity_name = entity['entity'].encode('utf-8') break else: if entity['end'] == idText: fout.write( Text.encode('utf-8') + " " + "E-" + entity_name + '\n') tagReady = True inWord = False break if not tagReady: if not inWord: fout.write(Text.encode('utf-8') + " " + "O" + '\n') else: fout.write( Text.encode('utf-8') + " " + "I-" + entity_name + '\n') idText = idText + 1 fout.write('\n') fout.close() print("Predict input data has been written into file. %s" % (output_file))
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30): """ generate data for charaters :param sentences_train: :param sentences_dev: :param sentences_test: :param max_sent_length: :return: C_train, C_dev, C_test, char_embedd_table """ def get_character_indexes(sentences): index_sentences = [] max_length = 0 for words in sentences: index_words = [] for word in words: index_chars = [] if len(word) > max_length: max_length = len(word) for char in word[:MAX_CHAR_LENGTH]: char_id = char_alphabet.get_index(char) index_chars.append(char_id) index_words.append(index_chars) index_sentences.append(index_words) return index_sentences, max_length def construct_tensor_char(index_sentences): C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32) word_end_id = char_alphabet.get_index(word_end) for i in range(len(index_sentences)): words = index_sentences[i] sent_length = len(words) for j in range(sent_length): chars = words[j] char_length = len(chars) for k in range(char_length): cid = chars[k] C[i, j, k] = cid # fill index of word end after the end of word C[i, j, char_length:] = word_end_id # Zero out C after the end of the sentence C[i, sent_length:, :] = 0 return C def build_char_embedd_table(): scale = np.sqrt(3.0 / char_embedd_dim) char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype( theano.config.floatX) return char_embedd_table char_alphabet = Alphabet('character') char_alphabet.get_index(word_end) index_sentences_train, max_char_length_train = get_character_indexes(sentences_train) index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev) index_sentences_test, max_char_length_test = get_character_indexes(sentences_test) # close character alphabet char_alphabet.close() logger.info("character alphabet size: %d" % (char_alphabet.size() - 1)) max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test)) logger.info("Maximum character length of training set is %d" % max_char_length_train) logger.info("Maximum character length of dev set is %d" % max_char_length_dev) logger.info("Maximum character length of test set is %d" % max_char_length_test) logger.info("Maximum character length used for training is %d" % max_char_length) # fill character tensor C_train = construct_tensor_char(index_sentences_train) C_dev = construct_tensor_char(index_sentences_dev) C_test = construct_tensor_char(index_sentences_test) return C_train, C_dev, C_test, build_char_embedd_table()
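# --- Illustrative sketch (not from the original sources) ---
# The two-level padding of construct_tensor_char above: pad each word's
# characters with the word-end id, then leave whole rows past the end of the
# sentence as zeros.
import numpy as np

def char_tensor(index_sentences, max_sent_length, max_char_length, word_end_id):
    C = np.zeros([len(index_sentences), max_sent_length, max_char_length],
                 dtype=np.int32)
    for i, words in enumerate(index_sentences):
        for j, chars in enumerate(words[:max_sent_length]):
            chars = chars[:max_char_length]
            C[i, j, :len(chars)] = chars
            C[i, j, len(chars):] = word_end_id   # pad inside the word
        # rows past the end of the sentence stay all-zero
    return C

C = char_tensor([[[4, 5], [6]]], max_sent_length=2, max_char_length=3,
                word_end_id=1)
# C[0] -> [[4 5 1], [6 1 1]]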
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 230 self.MAX_WORD_LENGTH = -1 self.number_normalized = False self.norm_word_emb = True self.norm_biword_emb = True self.norm_gaz_emb = False self.word_alphabet = Alphabet('word') self.biword_alphabet = Alphabet('biword') self.char_alphabet = Alphabet('character') # self.word_alphabet.add(START) # self.word_alphabet.add(UNKNOWN) # self.char_alphabet.add(START) # self.char_alphabet.add(UNKNOWN) # self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label', True) self.gaz_lower = False self.gaz = Gazetteer(self.gaz_lower) self.gaz_alphabet = Alphabet('gaz') self.HP_fix_gaz_emb = False self.HP_use_gaz = True self.tagScheme = "BMES" self.char_features = "LSTM" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.use_bigram = False self.word_emb_dim = 50 self.biword_emb_dim = 50 self.char_emb_dim = 50 self.gaz_emb_dim = 50 self.gaz_dropout = 0.5 self.pretrain_word_embedding = None self.pretrain_biword_embedding = None self.pretrain_gaz_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.biword_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 # hyperparameters self.HP_iteration = 100 self.HP_batch_size = 1 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = 5.0 self.HP_momentum = 0 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Use bigram: %s" % (self.use_bigram)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Biword alphabet size: %s" % (self.biword_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Biword embedding size: %s" % (self.biword_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Gaz embedding size: %s" % (self.gaz_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm biword emb: %s" % (self.norm_biword_emb)) print(" Norm gaz emb: %s" % (self.norm_gaz_emb)) print(" Norm gaz dropout: %s" % (self.gaz_dropout)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % (self.HP_bilstm)) print(" Hyperpara GPU: %s" % (self.HP_gpu)) print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) print(" Hyperpara fix gaz emb: 
%s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) if self.HP_use_char: print(" Char_features: %s" % (self.char_features)) print("DATA SUMMARY END.") sys.stdout.flush() def refresh_label_alphabet(self, input_file): old_size = self.label_alphabet_size self.label_alphabet.clear(True) in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() label = pairs[-1] self.label_alphabet.add(label) self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" self.fix_alphabet() print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size)) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in xrange(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) # 获取label label = pairs[-1] # 安装出现顺序添加 self.label_alphabet.add(label) self.word_alphabet.add(word) if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2: biword = word + in_lines[ idx + 1].strip().split()[0].decode('utf-8') else: biword = word + NULLKEY self.biword_alphabet.add(biword) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.biword_alphabet_size = self.biword_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False # 判断是否属于BIO,BMES,BIOES其中一�? for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: # 如果有S则为BMES或BIOES self.tagScheme = "BMES" else: # 没有则为BIO self.tagScheme = "BIO" def build_gaz_file(self, gaz_file): # build gaz file,initial read gaz embedding file if gaz_file: fins = open(gaz_file, 'r').readlines() for fin in fins: fin = fin.strip().split()[0].decode('utf-8') if fin: self.gaz.insert(fin, "one_source") print "Load gaz file: ", gaz_file, " total size:", self.gaz.size() else: print "Gaz file is None, load nothing" def build_gaz_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() word_list = [] for line in in_lines: if len(line) > 3: word = line.split()[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) word_list.append(word) else: w_length = len(word_list) for idx in range(w_length): matched_entity = self.gaz.enumerateMatchList( word_list[idx:]) for entity in matched_entity: # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity) self.gaz_alphabet.add(entity) word_list = [] print "gaz alphabet size:", self.gaz_alphabet.size() def fix_alphabet(self): self.word_alphabet.close() self.biword_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.gaz_alphabet.close() def build_word_pretrain_emb(self, emb_path): print "build word pretrain emb..." self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def build_radical_pretrain_emb(self, emb_path): print "build radical pretrain emb..." 
        # note: the radical embeddings are stored in the word-embedding slot
        self.pretrain_word_embedding, self.word_emb_dim = build_radical_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print "build biword pretrain emb..."
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print "build gaz pretrain emb..."
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.char_alphabet, self.gaz_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.char_alphabet, self.gaz_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.char_alphabet, self.gaz_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.char_alphabet, self.gaz_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                # content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
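
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): write_decoded_results()
# above emits the usual two-column format, one "word<space>tag" pair per line
# and a blank line between sentences. A minimal reader for that format, with
# an invented name, for reference:
def read_decoded_results(path):
    sentences, words, tags = [], [], []
    for line in open(path):
        line = line.rstrip('\n')
        if not line:
            if words:
                sentences.append((words, tags))
                words, tags = [], []
            continue
        word, tag = line.rsplit(' ', 1)
        words.append(word)
        tags.append(tag)
    if words:
        sentences.append((words, tags))
    return sentences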
def generate_character_data(sentences_train, sentences_dev, sentences_test,
                            max_sent_length, char_embedd_dim=80):
    """
    Generate data for characters.
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :param char_embedd_dim: dimension of the character embeddings
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)
                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)
                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length],
                     dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)
        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        logger.info('Dimension of char embedding is ' + str(char_embedd_dim))
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(
            -scale, scale,
            [char_alphabet.size(), char_embedd_dim]).astype(theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)
    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)
    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))
    max_char_length = min(MAX_CHAR_LENGTH,
                          max(max_char_length_train, max_char_length_dev,
                              max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)
    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)
    return C_train, C_dev, C_test, build_char_embedd_table()
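
# ---------------------------------------------------------------------------
# Illustrative note (not from the original code): build_char_embedd_table()
# above samples from U(-scale, scale) with scale = sqrt(3 / char_embedd_dim).
# Since a uniform distribution on (-a, a) has variance a^2 / 3, each component
# gets variance 1 / char_embedd_dim, so every embedding vector has expected
# squared norm 1. Quick numerical check:
import numpy as np

dim = 80                                  # char_embedd_dim used above
scale = np.sqrt(3.0 / dim)
table = np.random.uniform(-scale, scale, [1000, dim])
print(table.var())                        # ~ 1.0 / dim = 0.0125
print((table ** 2).sum(axis=1).mean())    # ~ 1.0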
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.number_normalized = True self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.feature_name2id = {} self.label_alphabet = Alphabet('label', True) self.tagScheme = "BMES" ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.word_emb_dir = None self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.pretrain_word_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.word_emb_dim = 50 self.char_emb_dim = 30 self.nbest = None self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_gpu = False self.HP_lr = 0.015 self.HP_l2 = 1e-8 # both self.full_data = False self.tune_wordemb = False # relation self.max_seq_len = 500 self.pad_idx = 0 self.sent_window = 3 # self.output =None self.unk_ratio = 1 self.seq_feature_size = 256 self.re_feature_name = [] self.re_feature_name2id = {} self.re_feature_alphabets = [] self.re_feature_num = len(self.re_feature_alphabets) self.re_feat_config = None self.re_feature_emb_dims = [] self.re_feature_alphabet_sizes = [] self.re_train_X = [] self.re_dev_X = [] self.re_test_X = [] self.re_train_Y = [] self.re_dev_Y = [] self.re_test_Y = [] self.patience = 10 # self.pretrained_model_dir = None def copy_alphabet(self, other): self.word_alphabet = copy.deepcopy(other.word_alphabet) self.char_alphabet = copy.deepcopy(other.char_alphabet) for feature_alphabet in other.feature_alphabets: self.feature_alphabets.append(copy.deepcopy(feature_alphabet)) self.label_alphabet = copy.deepcopy(other.label_alphabet) self.feature_name = copy.deepcopy(other.feature_name) self.feature_alphabets = copy.deepcopy(other.feature_alphabets) self.feature_num = len(self.feature_alphabets) self.feature_name2id = copy.deepcopy(other.feature_name2id) self.feature_alphabet_sizes = copy.deepcopy( other.feature_alphabet_sizes) self.feature_emb_dims = copy.deepcopy(other.feature_emb_dims) for re_feature_alphabet in other.re_feature_alphabets: self.re_feature_alphabets.append( copy.deepcopy(re_feature_alphabet)) self.re_feature_name = copy.deepcopy(other.re_feature_name) self.re_feature_name2id = copy.deepcopy(other.re_feature_name2id) self.re_feature_alphabets = copy.deepcopy(other.re_feature_alphabets) self.re_feature_num = len(self.re_feature_alphabets) self.re_feature_emb_dims = copy.deepcopy(other.re_feature_emb_dims) self.re_feature_alphabet_sizes = copy.deepcopy( other.re_feature_alphabet_sizes) def show_data_summary(self): print("++" * 50) print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding dir: %s" % (self.word_emb_dir)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Train file directory: %s" % (self.train_dir)) print(" Dev file directory: %s" % (self.dev_dir)) 
print(" Test file directory: %s" % (self.test_dir)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" FEATURE num: %s" % (self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print( " Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim)) print(" Iteration: %s" % (self.HP_iteration)) print(" BatchSize: %s" % (self.HP_batch_size)) print(" Hyper lr: %s" % (self.HP_lr)) print(" Hyper l2: %s" % (self.HP_l2)) print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyper dropout: %s" % (self.HP_dropout)) print(" Hyper GPU: %s" % (self.HP_gpu)) print(" Hyper NBEST: %s" % (self.nbest)) print(" full data: %s" % (self.full_data)) print(" Tune word embeddings: %s" % (self.tune_wordemb)) print(" max sequence length: %s" % (self.max_seq_len)) print(" pad index: %s" % (self.pad_idx)) print(" patience: %s" % (self.patience)) print(" sentence window: %s" % (self.sent_window)) # print(" Output directory: %s" % (self.output)) print(" The ratio using negative instnaces 0~1: %s" % (self.unk_ratio)) print(" Size of seqeuence feature representation: %s" % (self.seq_feature_size)) print(" RE FEATURE num: %s" % (self.re_feature_num)) for idx in range(self.re_feature_num): print(" Fe: %s alphabet size: %s" % (self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx])) print(" Fe: %s embedding size: %s" % (self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx])) print(" RE Train instance number: %s" % (len(self.re_train_Y))) print(" RE Dev instance number: %s" % (len(self.re_dev_Y))) print(" RE Test instance number: %s" % (len(self.re_test_Y))) # print(" pretrained_model_dir: %s" % (self.pretrained_model_dir)) print("DATA SUMMARY END.") print("++" * 50) sys.stdout.flush() def initial_feature_alphabets(self): feature_prefix = '[Cap]' self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) self.feature_name2id[feature_prefix] = 0 feature_prefix = '[POS]' self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) self.feature_name2id[feature_prefix] = 1 self.feature_num = len(self.feature_alphabets) self.feature_emb_dims = [20] * self.feature_num self.feature_alphabet_sizes = [0] * self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[ self.feature_name[idx]]['emb_size'] def build_alphabet(self, documents): for doc in documents: for sentence in doc: for token in sentence: word = token['word'] if self.number_normalized: word = normalize_word(word) label = token['label'] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet self.feature_alphabets[0].add(token['cap']) self.feature_alphabets[1].add(token['pos']) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[ idx].size() def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() for 
idx in range(self.feature_num): self.feature_alphabets[idx].close() def open_alphabet(self): self.word_alphabet.open() self.char_alphabet.open() # label not open # self.label_alphabet.open() for idx in range(self.feature_num): self.feature_alphabets[idx].open() def initial_re_feature_alphabets(self): id = 0 for k, v in self.re_feat_config.items(): self.re_feature_alphabets.append(Alphabet(k)) self.re_feature_name.append(k) self.re_feature_name2id[k] = id id += 1 self.re_feature_num = len(self.re_feature_alphabets) self.re_feature_emb_dims = [20] * self.re_feature_num self.re_feature_alphabet_sizes = [0] * self.re_feature_num if self.re_feat_config: for idx in range(self.re_feature_num): if self.re_feature_name[idx] in self.re_feat_config: self.re_feature_emb_dims[idx] = self.re_feat_config[ self.re_feature_name[idx]]['emb_size'] def build_re_feature_alphabets(self, tokens, entities, relations): entity_type_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[ENTITY_TYPE]']] entity_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[ENTITY]']] relation_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[RELATION]']] token_num_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[TOKEN_NUM]']] entity_num_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[ENTITY_NUM]']] position_alphabet = self.re_feature_alphabets[ self.re_feature_name2id['[POSITION]']] for i, doc_token in enumerate(tokens): doc_entity = entities[i] doc_relation = relations[i] sent_idx = 0 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] while sentence.shape[0] != 0: entities_in_sentence = doc_entity[( doc_entity['sent_idx'] == sent_idx)] for _, entity in entities_in_sentence.iterrows(): entity_type_alphabet.add(entity['type']) tk_idx = entity['tf_start'] while tk_idx <= entity['tf_end']: entity_alphabet.add( my_utils1.normalizeWord(sentence.iloc[ tk_idx, 0])) # assume 'text' is in 0 column tk_idx += 1 sent_idx += 1 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] for _, relation in doc_relation.iterrows(): relation_alphabet.add(relation['type']) for i in range(data.max_seq_len): token_num_alphabet.add(i) entity_num_alphabet.add(i) position_alphabet.add(i) position_alphabet.add(-i) for idx in range(self.re_feature_num): self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[ idx].size() def fix_re_alphabet(self): for alphabet in self.re_feature_alphabets: alphabet.close() def open_re_alphabet(self): for alphabet in self.re_feature_alphabets: if alphabet.name == '[RELATION]': # label not open continue alphabet.open() def build_pretrain_emb(self): if self.word_emb_dir: logging.info("Load pretrained word embedding, dir: %s" % (self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( self.word_emb_dir, self.word_alphabet, self.word_emb_dim) def generate_instance(self, name, documents): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( documents, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( documents, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance( documents, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, 
self.number_normalized, self.MAX_SENTENCE_LENGTH) else: logging.info( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def generate_re_instance(self, name, tokens, entities, relations, names): self.fix_re_alphabet() if name == "train": self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2( tokens, entities, relations, names, self) elif name == "dev": self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2( tokens, entities, relations, names, self) elif name == "test": self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2( tokens, entities, relations, names, self) else: logging.info( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def load(self, data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self, save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def clear_data(self): self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.re_train_X = [] self.re_dev_X = [] self.re_test_X = [] self.re_train_Y = [] self.re_dev_Y = [] self.re_test_Y = [] self.pretrain_word_embedding = None def read_config(self, config_file, opt): config = config_file_to_dict(config_file) ## read data: self.train_dir = opt.train_dir self.dev_dir = opt.dev_dir self.test_dir = opt.test_dir self.word_emb_dir = opt.word_emb_file the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = int(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) # both the_item = 'full_data' if the_item in config: self.full_data = str2bool(config[the_item]) the_item = 'tune_wordemb' if the_item in config: self.tune_wordemb = str2bool(config[the_item]) the_item = 'max_seq_len' if the_item in config: self.max_seq_len = int(config[the_item]) the_item = 'pad_idx' if the_item in config: self.pad_idx = int(config[the_item]) the_item = 'sent_window' if the_item in config: self.sent_window = int(config[the_item]) # the_item = 'output' # if the_item in config: # self.output = config[the_item] the_item = 'unk_ratio' if the_item in config: self.unk_ratio = float(config[the_item]) the_item = 'seq_feature_size' if the_item in config: self.seq_feature_size = int(config[the_item]) the_item = 're_feature' if the_item in config: self.re_feat_config 
= config[the_item] ## feat_config is a dict the_item = 'patience' if the_item in config: self.patience = int(config[the_item])
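
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): read_config() above
# repeats the same "the_item = ...; if the_item in config:" block per option.
# For the simple scalar options, an equivalent table-driven form looks like
# this (attribute name, config key, converter per row); nested options such
# as 'feature' and 're_feature' would still need dedicated handling. The
# names below are invented, and str2bool stands in for the helper used above.
def str2bool(s):
    return str(s).lower() in ('true', '1', 'yes')

_SCALAR_OPTIONS = [
    ('MAX_SENTENCE_LENGTH', 'MAX_SENTENCE_LENGTH', int),
    ('number_normalized',   'number_normalized',   str2bool),
    ('word_emb_dim',        'word_emb_dim',        int),
    ('HP_batch_size',       'batch_size',          int),
    ('HP_lr',               'learning_rate',       float),
    ('HP_dropout',          'dropout',             float),
    ('patience',            'patience',            int),
]

def apply_scalar_options(data, config):
    for attr, key, convert in _SCALAR_OPTIONS:
        if key in config:
            setattr(data, attr, convert(config[key]))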
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4, head_column=6, type_column=7, embedding="word2Vec", embedding_path=None): """ load data from file :param train_path: path of training file :param dev_path: path of dev file :param test_path: path of test file :param word_column: the column index of word (start from 0) :param pos_column: the column index of pos (start from 0) :param head_column: the column index of head (start from 0) :param type_column: the column index of types (start from 0) :param embedding: embeddings for words, choose from ['word2vec', 'senna']. :param embedding_path: path of file storing word embeddings. :return: X_train, POS_train, Head_train, Type_train, mask_train, X_dev, POS_dev, Head_dev, Type_dev, mask_dev, X_test, POS_test, Head_test, Type_test, mask_test, embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table """ def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences, type_index_sentences): X = np.empty([len(word_index_sentences), max_length], dtype=np.int32) POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32) Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32) mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX) for i in range(len(word_index_sentences)): word_ids = word_index_sentences[i] pos_ids = pos_index_sentences[i] heads = head_sentences[i] type_ids = type_index_sentences[i] length = len(word_ids) for j in range(length): wid = word_ids[j] pid = pos_ids[j] head = heads[j] tid = type_ids[j] X[i, j] = wid POS[i, j] = pid - 1 Head[i, j] = head Type[i, j] = tid - 1 # Zero out X after the end of the sequence X[i, length:] = 0 # Copy the last label after the end of the sequence POS[i, length:] = POS[i, length - 1] Head[i, length:] = Head[i, length - 1] Type[i, length:] = Type[i, length - 1] # Make the mask for this sample 1 within the range of length mask[i, :length] = 1 return X, POS, Head, Type, mask word_alphabet = Alphabet('word') pos_alphabet = Alphabet('pos') type_alphabet = Alphabet('type') # read training data logger.info("Reading data from training set...") word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \ word_index_sentences_train, pos_index_sentences_train, \ type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # read dev data logger.info("Reading data from dev set...") word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \ word_index_sentences_dev, pos_index_sentences_dev, \ type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # read test data logger.info("Reading data from test set...") word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \ word_index_sentences_test, pos_index_sentences_test, \ type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column, pos_column, head_column, type_column) # close alphabets word_alphabet.close() pos_alphabet.close() type_alphabet.close() logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1)) logger.info("type alphabet size: %d" % 
(type_alphabet.size() - 1)) # get maximum length max_length_train = get_max_length(word_sentences_train) max_length_dev = get_max_length(word_sentences_dev) max_length_test = get_max_length(word_sentences_test) max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test)) logger.info("Maximum length of training set is %d" % max_length_train) logger.info("Maximum length of dev set is %d" % max_length_dev) logger.info("Maximum length of test set is %d" % max_length_test) logger.info("Maximum length used for training is %d" % max_length) embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict( embedding, embedding_path, word_alphabet, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length]) X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor( word_index_sentences_train, pos_index_sentences_train, head_sentences_train, type_index_sentences_train) X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor( word_index_sentences_dev, pos_index_sentences_dev, head_sentences_dev, type_index_sentences_dev) X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor( word_index_sentences_test, pos_index_sentences_test, head_sentences_test, type_index_sentences_test) embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless) C_train, C_dev, C_test, char_embedd_table = generate_character_data( word_sentences_train, word_sentences_dev, word_sentences_test, max_length) return X_train, POS_train, Head_train, Type_train, mask_train, \ X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \ X_test, POS_test, Head_test, Type_test, mask_test, \ embedd_table, word_alphabet, pos_alphabet, type_alphabet, \ C_train, C_dev, C_test, char_embedd_table
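
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): construct_tensor() above
# pads POS/Head/Type by repeating the last label beyond the sentence end and
# relies on `mask` to keep those positions out of the loss. The numpy toy
# below shows the masked mean that this padding scheme assumes; the loss
# values are invented.
import numpy as np

per_token_loss = np.array([[0.2, 0.1, 0.4, 0.4],   # sentence of length 2, padded to 4
                           [0.3, 0.3, 0.3, 0.3]])  # sentence of length 4
mask = np.array([[1, 1, 0, 0],
                 [1, 1, 1, 1]], dtype=np.float32)
loss = (per_token_loss * mask).sum() / mask.sum()  # mean over the 6 real tokens
print(loss)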
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') # self.word_alphabet.add(START) # self.word_alphabet.add(UNKNOWN) # self.char_alphabet.add(START) # self.char_alphabet.add(UNKNOWN) # self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label', True) self.tagScheme = "NoSeg" self.char_features = "LSTM" ## "LSTM"/"CNN" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.word_emb_dim = 50 self.char_emb_dim = 30 self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_average_batch_loss = False self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 50 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = False self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm char emb: %s" % (self.norm_char_emb)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyper iteration: %s" % (self.HP_iteration)) print(" Hyper batch size: %s" % (self.HP_batch_size)) print(" Hyper average batch: %s" % (self.HP_average_batch_loss)) print(" Hyper lr: %s" % (self.HP_lr)) print(" Hyper lr_decay: %s" % (self.HP_lr_decay)) print(" Hyper HP_clip: %s" % (self.HP_clip)) print(" Hyper momentum: %s" % (self.HP_momentum)) print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyper dropout: %s" % (self.HP_dropout)) print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyper bilstm: %s" % (self.HP_bilstm)) print(" Hyper GPU: %s" % (self.HP_gpu)) print(" Hyper use_char: %s" % (self.HP_use_char)) if self.HP_use_char: print(" Char_features: %s" % (self.char_features)) print("DATA SUMMARY END.") sys.stdout.flush() def refresh_label_alphabet(self, input_file): old_size = self.label_alphabet_size self.label_alphabet.clear(True) in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() label = pairs[-1] self.label_alphabet.add(label) self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" self.fix_alphabet() print("Refresh 
label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def extend_word_char_alphabet(self, input_file_list):
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                if len(line) > 2:
                    pairs = line.strip().split()
                    word = pairs[0]
                    if self.number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print(" old word:%s -> new word:%s" % (old_word_size, self.word_alphabet_size))
        print(" old char:%s -> new char:%s" % (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print(" from file:%s" % (input_file))

    def build_alphabet(self, input_file):
        in_lines_string = open(input_file + ".string.txt", 'r').readlines()
        in_lines_label = open(input_file + ".label.txt", 'r').readlines()
        for line_string, line_label in zip(in_lines_string, in_lines_label):
            line_label = line_label[:-1].split(',')
            line_string = line_string[:-1]
            assert len(line_label) == len(line_string)
            for i in range(len(line_label)):
                self.label_alphabet.add(line_label[i])
                self.word_alphabet.add(line_string[i])
        self.char_alphabet.add("*")
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_char_pretrain_emb(self, emb_path):
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! 
Illegal input:%s" % (name)) def write_decoded_results(self, output_file, predict_results, name): fout = open(output_file, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. %s" % (name, output_file))
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = {0: Alphabet('label', True)} self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.decode_dir = None self.dset_dir = None ## data vocabulary related file self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] #Added for pretraining self.PRETRAINED_ALL = "all" self.PRETRAINED_LSTMS = "lstms" self.pretrained_model = None self.pretrained_part = None self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 #self.label_alphabet_size = 0 self.label_alphabet_sizes = {0: 0} self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" self.status = "train" ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_feature_default_size = 20 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 #D: The number of tasks to be solved self.HP_tasks = 1 self.HP_main_tasks = self.HP_tasks self.HP_tasks_weights = [1] self.optimize_with_evalb = False self.optimize_with_las = False self.offset = False self.choice_of_best_model = "avg" self.language = "English" # self.HP_tasks_inputs = [self.LSTMOUT] #Policy Gradient self.No_samples = 8 self.pg_variance_reduce = True self.variance_reduce_burn_in = 999 self.pg_valsteps = 1000 self.entropy_regularisation = True self.entropy_reg_coeff = 0.01 #Hyper-parameters for disjoint training self.train_task_ids = [] self.dev_task_ids = [] self.test_task_ids = [] self.raw_task_ids = [] self.disjoint = True self.datasets = {} self.tasks_metrics = {} self.HP_tasks_weight_decays = [0] def show_data_summary(self): print("++" * 50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) for idtask in self.label_alphabet: print(" Label alphabet size for task %s: %s" % (idtask, self.label_alphabet_sizes[idtask])) #print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word 
embedding dir: %s" % (self.word_emb_dir)) print(" Char embedding dir: %s" % (self.char_emb_dir)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm char emb: %s" % (self.norm_char_emb)) print(" Train file directory: %s" % (self.train_dir)) print(" Dev file directory: %s" % (self.dev_dir)) print(" Test file directory: %s" % (self.test_dir)) print(" Raw file directory: %s" % (self.raw_dir)) print(" Dset file directory: %s" % (self.dset_dir)) print(" Model file directory: %s" % (self.model_dir)) print(" Pretrained model : %s" % (self.pretrained_model)) print(" Pretrained part : %s" % (self.pretrained_part)) print(" Loadmodel directory: %s" % (self.load_model_dir)) print(" Decode file directory: %s" % (self.decode_dir)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" FEATURE num: %s" % (self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print( " Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print( " Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" " + "++" * 20) print(" Model Network:") print(" Model use_crf: %s" % (self.use_crf)) print(" Model word extractor: %s" % (self.word_feature_extractor)) print(" Model use_char: %s" % (self.use_char)) if self.use_char: print(" Model char extractor: %s" % (self.char_feature_extractor)) print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim)) print(" " + "++" * 20) print(" Training:") print(" Optimizer: %s" % (self.optimizer)) print(" Iteration: %s" % (self.HP_iteration)) print(" BatchSize: %s" % (self.HP_batch_size)) print(" Average batch loss: %s" % (self.average_batch_loss)) print(" " + "++" * 20) print(" Hyperparameters:") print(" Hyper lr: %s" % (self.HP_lr)) print(" Hyper lr_decay: %s" % (self.HP_lr_decay)) print(" Hyper HP_clip: %s" % (self.HP_clip)) print(" Hyper momentum: %s" % (self.HP_momentum)) print(" Hyper l2: %s" % (self.HP_l2)) print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyper dropout: %s" % (self.HP_dropout)) print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyper bilstm: %s" % (self.HP_bilstm)) print(" Hyper GPU: %s" % (self.HP_gpu)) print(" Hyper number of tasks: %s" % (self.HP_tasks)) print("DATA SUMMARY END.") print("++" * 50) sys.stdout.flush() def initial_feature_alphabets(self): for l in open(self.train_dir, 'r').readlines(): if not l.startswith("#") and not l.startswith("-BOS-"): items = l.strip("\n").split() break total_column = len(items) if total_column > 2: for idx in range(1, total_column - 1): feature_prefix = items[idx].split(']', 1)[0] + "]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None] * self.feature_num self.feature_emb_dims = [self.HP_feature_default_size ] * self.feature_num #self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None] * self.feature_num 
self.norm_feature_embs = [False] * self.feature_num self.feature_alphabet_sizes = [0] * self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[ self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[ self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[ self.feature_name[idx]]['emb_norm'] def build_alphabet(self, input_file): sample_corpus = None in_lines = open(input_file, 'r').readlines() for line in in_lines: if line.upper().startswith( TREEBANK_LINE ): #Check the treebank this sentence comes from sample_corpus = "[" + line.upper().replace(TREEBANK_LINE, "").strip() + "]" elif len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] if self.HP_tasks > 1 or not self.disjoint: #self.task_config[sample_corpus]["nb_tasks"] > 1: label = parse_multitask_label(label) else: label = [label] if len(label) != len( self.label_alphabet) and not self.disjoint: raise ValueError( "The number of tasks and the number of labels in the output column do not match" ) init_label_alp_index = 0 if not self.disjoint else self.task_config[ sample_corpus]["idstask"] for idtask, l in enumerate(label, init_label_alp_index): #for idtask, l in enumerate(label): self.label_alphabet[idtask].add(l) self.word_alphabet.add(word) for idx in range(self.feature_num): feat_idx = pairs[idx + 1].split(']', 1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() for idtask in self.label_alphabet: self.label_alphabet_sizes[idtask] = self.label_alphabet[ idtask].size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[ idx].size() for idtask in self.label_alphabet: startS = False startB = False for label, _ in self.label_alphabet[idtask].iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() for idtask in self.label_alphabet: self.label_alphabet[idtask].close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding( self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print( "Load pretrained feature %s embedding:, norm: %s, dir: %s" % (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[ idx] = build_pretrain_embedding( self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def generate_instance(self, name): self.fix_alphabet() if name == "train": 
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, predict_results, name, indexes=None):
        fout = open(self.decode_dir, 'w')
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        for task_predict_results in predict_results:
            sent_num = len(task_predict_results)
            assert (sent_num == len(content_list))
        for idx in range(sent_num):
            if indexes is not None and idx not in indexes:
                continue
            # index 0 gives the length of the input sentence
            sent_length = len(predict_results[0][idx])
            for idy in range(sent_length):
                # content_list[idx] is a list with [word, char, label]
                inputs = []
                for id_input in range(len(content_list[idx]) - 2):
                    if content_list[idx][id_input][0] != []:
                        if isinstance(content_list[idx][id_input][idy], list):
                            for feature in content_list[idx][id_input][idy]:
                                inputs.append(feature.encode('utf-8'))
                        else:
                            inputs.append(content_list[idx][id_input][idy].encode('utf-8'))
                outputs = []
                for task in predict_results:
                    outputs.append(task[idx][idy])
                fout.write("\t".join(inputs) + "\t" + "{}".join(outputs) + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
) for idtask_predict_results, task_predict_results in enumerate( predict_results): sent_num = len(task_predict_results) assert (sent_num == len(content_list)) for idx in range(sent_num): score_string = "# " for idtask_predict_results, task_predict_results in enumerate( predict_results): sent_length = len(task_predict_results[idx][0]) nbest = len(task_predict_results[0]) #Printing the probabilities for idz in range(nbest): score_string += format( pred_scores[idtask_predict_results][idx][idz], '.4f') + " " fout.write(score_string.strip() + "\t") fout.write("\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + "\t" for ifeat in range(len(content_list[idx][1][idy])): label_string += content_list[idx][1][idy][ifeat].encode( 'utf-8') + "\t" for idtask_predict_results, task_predict_results in enumerate( predict_results): for idz in range(nbest): label_string += task_predict_results[idx][idz][ idy] + "," label_string = label_string.strip().strip(",") + "{}" fout.write(label_string) fout.write('\n') fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s" % (name, nbest, self.decode_dir)) def read_config(self, config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'dset_dir' if the_item in config: self.dset_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict the_item = 'feature_default_size' if the_item in config: self.HP_feature_default_size = int(config[the_item]) ## read training setting: 
the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) the_item = 'status' if the_item in config: self.status = config[the_item] ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = str2bool(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) #Hyperparameters for auxiliary tasks over the same treebank the_item = 'disjoint' if the_item in config: self.disjoint = str2bool(config[the_item]) if not self.disjoint: the_item = 'tasks' if the_item in config: self.HP_tasks = int(config[the_item]) if self.HP_tasks > 1: self.label_alphabet = { idtask: Alphabet('label', True) for idtask in range(self.HP_tasks) } self.label_alphabet_sizes = { idtask: self.label_alphabet[idtask].size() for idtask in range(self.HP_tasks) } the_item = "main_tasks" if the_item in config: self.HP_main_tasks = int(config[the_item]) print "main_tasks: %s, tasks: %s" % (self.HP_main_tasks, self.HP_tasks) if self.HP_main_tasks > self.HP_tasks: raise ValueError( "HP_main_tasks cannot be greater than HP_tasks") the_item = 'tasks_weights' if the_item in config: self.HP_tasks_weights = map(float, config[the_item].split("|")) else: #Hyperparameters for auxiliary tasks over a different treebank the_item = 'dataset' if the_item in config: self.task_config = config[the_item] ## task_config is a dict self.HP_tasks = sum([ self.task_config[idtask]["nb_tasks"] for idtask in self.task_config ]) self.HP_main_tasks = sum([ self.task_config[idtask]["nb_tasks"] for idtask in self.task_config if self.task_config[idtask]["main"] ]) self.label_alphabet = { idtask: Alphabet('label', True) for idtask in range(self.HP_tasks) } self.label_alphabet_sizes = { idtask: self.label_alphabet[idtask].size() for idtask in range(self.HP_tasks) } self.HP_tasks_weights = [] self.HP_tasks_weight_decays = [] for idtask in self.task_config: for weight in self.task_config[idtask]["weight"]: self.HP_tasks_weights.append(weight) if "weight_decay" in self.task_config[idtask]: for weight_decay in self.task_config[idtask][ "weight_decay"]: self.HP_tasks_weight_decays.append(weight_decay) else: for j in range(self.task_config[idtask]["nb_tasks"]): self.HP_tasks_weight_decays.append(0) self.dataset_ids = { treebank: range( self.task_config[treebank]["idstask"], self.task_config[treebank]["idstask"] + self.task_config[treebank]["nb_tasks"]) for id, treebank in
enumerate(self.task_config) } self.ignore_after_epoch = { treebank: self.task_config[treebank]["ignore_after_epoch"] if "ignore_after_epoch" in self.task_config[treebank] else self.HP_iteration + 1 for treebank in self.task_config } self.inv_dataset_ids = {} for tb in self.dataset_ids: for subtask in self.dataset_ids[tb]: self.inv_dataset_ids[subtask] = tb self.task_metric = {} for dataset in self.task_config: for i in range( self.task_config[dataset]["idstask"], self.task_config[dataset]["idstask"] + self.task_config[dataset]["nb_tasks"]): if "metric" in self.task_config[dataset]: self.task_metric[i] = self.task_config[dataset][ "metric"] the_item = 'evaluate' if the_item in config: self.evaluate = config[the_item] the_item = "gold_dev_trees" if the_item in config: self.gold_dev_trees = config[the_item] the_item = "gold_dev_dep" if the_item in config: self.gold_dev_dep = config[the_item] the_item = "combine_dependency_offset" if the_item in config: self.offset = str2bool(config[the_item]) the_item = "pretrained_model" if the_item in config: self.pretrained_model = config[the_item] the_item = "pretrained_part" if the_item in config: if config[the_item].lower() not in [ self.PRETRAINED_ALL, self.PRETRAINED_LSTMS ]: raise ValueError( "Invalid value for pretrained_part (must be 'all' or 'lstms')" ) self.pretrained_part = config[the_item] the_item = "optimize_with_las" if the_item in config: self.optimize_with_las = str2bool(config[the_item]) the_item = "gold_train_trees" if the_item in config: self.gold_train_trees = config[the_item]
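As a quick illustration of the convention parsed above: `tasks_weights` arrives as a pipe-separated string carrying one loss weight per (sub)task. A minimal sketch of that parsing, with a hypothetical helper name and literal config values that are not part of this codebase (a list comprehension replaces Python 2's map() so the snippet runs on both 2 and 3):

def parse_tasks_weights(raw_value):
    # "1|0.2|0.2" -> [1.0, 0.2, 0.2]: one weight per task, in task-id order
    return [float(w) for w in raw_value.split("|")]

config = {'tasks': '3', 'main_tasks': '1', 'tasks_weights': '1|0.2|0.2'}
HP_tasks = int(config['tasks'])
HP_main_tasks = int(config['main_tasks'])
HP_tasks_weights = parse_tasks_weights(config['tasks_weights'])
# Same sanity constraints read_config enforces above
assert HP_main_tasks <= HP_tasks
assert len(HP_tasks_weights) == HP_tasks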
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.feature_name2id = {} self.label_alphabet = Alphabet('label',True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.model_dir = None ## model save file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 # both self.full_data = False self.tune_wordemb = False # relation self.pretrain = None self.max_seq_len = 500 self.pad_idx = 1 self.sent_window = 3 self.output =None self.unk_ratio=1 self.seq_feature_size=256 self.max_epoch = 100 self.feature_extractor=None self.re_feature_name = [] self.re_feature_name2id = {} self.re_feature_alphabets = [] self.re_feature_num = len(self.re_feature_alphabets) self.re_feat_config = None self.re_train_X = [] self.re_dev_X = [] self.re_test_X = [] self.re_train_Y = [] self.re_dev_Y = [] self.re_test_Y = [] def show_data_summary(self): print("++"*50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s"%(self.tagScheme)) print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) print(" Number normalized: %s"%(self.number_normalized)) print(" Word alphabet size: %s"%(self.word_alphabet_size)) print(" Char alphabet size: %s"%(self.char_alphabet_size)) print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word embedding dir: %s"%(self.word_emb_dir)) print(" Char embedding dir: %s"%(self.char_emb_dir)) print(" Word embedding size: %s"%(self.word_emb_dim)) print(" Char embedding size: %s"%(self.char_emb_dim)) print(" Norm word emb: %s"%(self.norm_word_emb)) print(" Norm char emb: %s"%(self.norm_char_emb)) print(" Train file directory: %s"%(self.train_dir)) print(" Dev file directory: %s"%(self.dev_dir)) print(" Test file directory: %s"%(self.test_dir)) print(" Model file directory: %s"%(self.model_dir)) print(" Train instance number: %s"%(len(self.train_texts))) print(" Dev instance number: %s"%(len(self.dev_texts))) print(" Test instance number: %s"%(len(self.test_texts))) print(" FEATURE num: %s"%(self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: 
%s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) # for k, v in self.feat_config.items(): # print(" Feature: %s, size %s, norm %s, dir %s"%(k, v['emb_size'], v['emb_norm'], v['emb_dir'])) print(" "+"++"*20) print(" Model Network:") print(" Model use_crf: %s"%(self.use_crf)) print(" Model word extractor: %s"%(self.word_feature_extractor)) print(" Model use_char: %s"%(self.use_char)) if self.use_char: print(" Model char extractor: %s"%(self.char_feature_extractor)) print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) print(" "+"++"*20) print(" Training:") print(" Optimizer: %s"%(self.optimizer)) print(" Iteration: %s"%(self.HP_iteration)) print(" BatchSize: %s"%(self.HP_batch_size)) print(" Average batch loss: %s"%(self.average_batch_loss)) print(" "+"++"*20) print(" Hyperparameters:") print(" Hyper lr: %s"%(self.HP_lr)) print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) print(" Hyper HP_clip: %s"%(self.HP_clip)) print(" Hyper momentum: %s"%(self.HP_momentum)) print(" Hyper l2: %s"%(self.HP_l2)) print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) print(" Hyper dropout: %s"%(self.HP_dropout)) print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) print(" Hyper bilstm: %s"%(self.HP_bilstm)) print(" Hyper GPU: %s"%(self.HP_gpu)) print(" Hyper NBEST: %s"%(self.nbest)) print(" " + "++" * 20) print(" Both:") print(" full data: %s" % (self.full_data)) print(" Tune word embeddings: %s" % (self.tune_wordemb)) print(" "+"++"*20) print(" Relation:") print(" Pretrain directory: %s" % (self.pretrain)) print(" max sequence length: %s" % (self.max_seq_len)) print(" pad index: %s" % (self.pad_idx)) print(" sentence window: %s" % (self.sent_window)) print(" Output directory: %s" % (self.output)) print(" The ratio using negative instnaces 0~1: %s" % (self.unk_ratio)) print(" Size of seqeuence feature representation: %s" % (self.seq_feature_size)) print(" Iteration for relation training: %s" % (self.max_epoch)) print(" feature_extractor: %s" % (self.feature_extractor)) print(" RE FEATURE num: %s"%(self.re_feature_num)) for idx in range(self.re_feature_num): print(" Fe: %s alphabet size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.re_feature_alphabets[idx].name, self.re_norm_feature_embs[idx])) print(" RE Train instance number: %s"%(len(self.re_train_Y))) print(" RE Dev instance number: %s"%(len(self.re_dev_Y))) print(" RE Test instance number: %s"%(len(self.re_test_Y))) print("DATA SUMMARY END.") print("++"*50) sys.stdout.flush() def initial_feature_alphabets(self, input_file): items = open(input_file,'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: id = 0 for idx in range(1, total_column-1): feature_prefix = items[idx].split(']',1)[0]+"]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) self.feature_name2id[feature_prefix] = id id += 1 print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) 
self.pretrain_feature_embeddings = [None]*self.feature_num self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None]*self.feature_num self.norm_feature_embs = [False]*self.feature_num self.feature_alphabet_sizes = [0]*self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file,'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx+1].split(']',1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() startS = False startB = False for label,_ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def initial_re_feature_alphabets(self): id = 0 for k, v in self.re_feat_config.items(): self.re_feature_alphabets.append(Alphabet(k)) self.re_feature_name.append(k) self.re_feature_name2id[k] = id id += 1 self.re_feature_num = len(self.re_feature_alphabets) self.re_pretrain_feature_embeddings = [None]*self.re_feature_num self.re_feature_emb_dims = [20]*self.re_feature_num self.re_feature_emb_dirs = [None]*self.re_feature_num self.re_norm_feature_embs = [False]*self.re_feature_num self.re_feature_alphabet_sizes = [0]*self.re_feature_num if self.re_feat_config: for idx in range(self.re_feature_num): if self.re_feature_name[idx] in self.re_feat_config: self.re_feature_emb_dims[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_size'] self.re_feature_emb_dirs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_dir'] self.re_norm_feature_embs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_norm'] def build_re_feature_alphabets(self, tokens, entities, relations): entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']] entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']] relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']] token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']] entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']] position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']] for i, doc_token in enumerate(tokens): doc_entity = entities[i] doc_relation = relations[i] sent_idx = 0 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] while sentence.shape[0] != 0: entities_in_sentence = 
doc_entity[(doc_entity['sent_idx'] == sent_idx)] for _, entity in entities_in_sentence.iterrows(): entity_type_alphabet.add(entity['type']) tk_idx = entity['tf_start'] while tk_idx <= entity['tf_end']: entity_alphabet.add( my_utils1.normalizeWord(sentence.iloc[tk_idx, 0])) # assume 'text' is in 0 column tk_idx += 1 sent_idx += 1 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] for _, relation in doc_relation.iterrows(): relation_alphabet.add(relation['type']) for i in range(self.max_seq_len): token_num_alphabet.add(i) entity_num_alphabet.add(i) position_alphabet.add(i) position_alphabet.add(-i) for idx in range(self.re_feature_num): self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size() def fix_re_alphabet(self): for alphabet in self.re_feature_alphabets: alphabet.close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print("Load pretrained feature %s embedding, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def build_re_pretrain_emb(self): for idx in range(self.re_feature_num): if self.re_feature_emb_dirs[idx]: print("Load pretrained re feature %s embedding, norm: %s, dir: %s" % (self.re_feature_name[idx], self.re_norm_feature_embs[idx], self.re_feature_emb_dirs[idx])) self.re_pretrain_feature_embeddings[idx], self.re_feature_emb_dims[idx] = build_pretrain_embedding( self.re_feature_emb_dirs[idx], self.re_feature_alphabets[idx], self.re_feature_emb_dims[idx], self.re_norm_feature_embs[idx]) def generate_instance(self, name, input_file): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print("Error: you can only generate train/dev/test instance!
Illegal input:%s"%(name)) def generate_re_instance(self, name, tokens, entities, relations, names): self.fix_re_alphabet() if name == "train": self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) elif name == "dev": self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) elif name == "test": self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) else: print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name)) def load(self,data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self,save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def read_config(self,config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict ## read training setting: the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' 
if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = int(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) # both the_item = 'full_data' if the_item in config: self.full_data = str2bool(config[the_item]) the_item = 'tune_wordemb' if the_item in config: self.tune_wordemb = str2bool(config[the_item]) # relation the_item = 'pretrain' if the_item in config: self.pretrain = config[the_item] the_item = 'max_seq_len' if the_item in config: self.max_seq_len = int(config[the_item]) the_item = 'pad_idx' if the_item in config: self.pad_idx = int(config[the_item]) the_item = 'sent_window' if the_item in config: self.sent_window = int(config[the_item]) the_item = 'output' if the_item in config: self.output = config[the_item] the_item = 'unk_ratio' if the_item in config: self.unk_ratio = float(config[the_item]) the_item = 'seq_feature_size' if the_item in config: self.seq_feature_size = int(config[the_item]) the_item = 'max_epoch' if the_item in config: self.max_epoch = int(config[the_item]) the_item = 'feature_extractor' if the_item in config: self.feature_extractor = config[the_item] the_item = 're_feature' if the_item in config: self.re_feat_config = config[the_item] ## feat_config is a dict
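The `save`/`load` pair above round-trips the whole object through its `__dict__`, so a restored instance picks up every alphabet, id list, and hyperparameter at once. A self-contained sketch of the same pattern on a toy class (the class, attribute names, and file name are illustrative, not the real `Data`):

import pickle

class ToyData(object):
    def __init__(self):
        self.word_emb_dim = 50
        self.train_Ids = []

    def save(self, save_file):
        with open(save_file, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)  # protocol 2, as in Data.save

    def load(self, data_file):
        with open(data_file, 'rb') as f:
            self.__dict__.update(pickle.load(f))  # overwrite defaults in place

d = ToyData()
d.word_emb_dim = 100
d.save('toy.dset')
restored = ToyData()
restored.load('toy.dset')
assert restored.word_emb_dim == 100

One consequence of this design is that anything assigned to `self` gets persisted, including loaded embeddings, which is why these codebases typically save the `Data` object once after preprocessing and reload it for decoding.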
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = Alphabet('label',True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.decode_dir = None self.dset_dir = None ## data vocabulary related file self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" self.status = "train" ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 def show_data_summary(self): print("++"*50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s"%(self.tagScheme)) print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) print(" Number normalized: %s"%(self.number_normalized)) print(" Word alphabet size: %s"%(self.word_alphabet_size)) print(" Char alphabet size: %s"%(self.char_alphabet_size)) print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word embedding dir: %s"%(self.word_emb_dir)) print(" Char embedding dir: %s"%(self.char_emb_dir)) print(" Word embedding size: %s"%(self.word_emb_dim)) print(" Char embedding size: %s"%(self.char_emb_dim)) print(" Norm word emb: %s"%(self.norm_word_emb)) print(" Norm char emb: %s"%(self.norm_char_emb)) print(" Train file directory: %s"%(self.train_dir)) print(" Dev file directory: %s"%(self.dev_dir)) print(" Test file directory: %s"%(self.test_dir)) print(" Raw file directory: %s"%(self.raw_dir)) print(" Dset file directory: %s"%(self.dset_dir)) print(" Model file directory: %s"%(self.model_dir)) print(" Loadmodel directory: %s"%(self.load_model_dir)) print(" Decode file directory: %s"%(self.decode_dir)) print(" Train instance number: %s"%(len(self.train_texts))) print(" Dev instance number: %s"%(len(self.dev_texts))) print(" Test instance number: %s"%(len(self.test_texts))) print(" Raw instance number: %s"%(len(self.raw_texts))) print(" FEATURE num: %s"%(self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: 
%s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" "+"++"*20) print(" Model Network:") print(" Model use_crf: %s"%(self.use_crf)) print(" Model word extractor: %s"%(self.word_feature_extractor)) print(" Model use_char: %s"%(self.use_char)) if self.use_char: print(" Model char extractor: %s"%(self.char_feature_extractor)) print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) print(" "+"++"*20) print(" Training:") print(" Optimizer: %s"%(self.optimizer)) print(" Iteration: %s"%(self.HP_iteration)) print(" BatchSize: %s"%(self.HP_batch_size)) print(" Average batch loss: %s"%(self.average_batch_loss)) print(" "+"++"*20) print(" Hyperparameters:") print(" Hyper lr: %s"%(self.HP_lr)) print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) print(" Hyper HP_clip: %s"%(self.HP_clip)) print(" Hyper momentum: %s"%(self.HP_momentum)) print(" Hyper l2: %s"%(self.HP_l2)) print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) print(" Hyper dropout: %s"%(self.HP_dropout)) print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) print(" Hyper bilstm: %s"%(self.HP_bilstm)) print(" Hyper GPU: %s"%(self.HP_gpu)) print("DATA SUMMARY END.") print("++"*50) sys.stdout.flush() def initial_feature_alphabets(self): items = open(self.train_dir,'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: for idx in range(1, total_column-1): feature_prefix = items[idx].split(']',1)[0]+"]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None]*self.feature_num self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None]*self.feature_num self.norm_feature_embs = [False]*self.feature_num self.feature_alphabet_sizes = [0]*self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file,'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx+1].split(']',1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() startS = False startB = False for label,_ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): 
self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print("Load pretrained feature %s embedding, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def generate_instance(self, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance(self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance(self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance(self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance(self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s"%(name)) def write_decoded_results(self, predict_results, name): fout = open(self.decode_dir,'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") assert(sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file.
%s"%(name, self.decode_dir)) def load(self,data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self,save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def write_nbest_decoded_results(self, predict_results, pred_scores, name): ## predict_results : [whole_sent_num, nbest, each_sent_length] ## pred_scores: [whole_sent_num, nbest] fout = open(self.decode_dir,'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") assert(sent_num == len(content_list)) assert(sent_num == len(pred_scores)) for idx in range(sent_num): sent_length = len(predict_results[idx][0]) nbest = len(predict_results[idx]) score_string = "# " for idz in range(nbest): score_string += format(pred_scores[idx][idz], '.4f')+" " fout.write(score_string.strip() + "\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + " " for idz in range(nbest): label_string += predict_results[idx][idz][idy]+" " label_string = label_string.strip() + "\n" fout.write(label_string) fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s"%(name,nbest, self.decode_dir)) def read_config(self,config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'dset_dir' if the_item in config: self.dset_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = 
config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict ## read training setting: the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) the_item = 'status' if the_item in config: self.status = config[the_item] ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = str2bool(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item])
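Every boolean option in `read_config` goes through `str2bool`, whose definition is outside this excerpt. A minimal version consistent with how it is called here would be the following (the exact set of accepted spellings is an assumption, not taken from this codebase):

def str2bool(string):
    # Assumed behaviour: case-insensitive truthy spellings map to True,
    # everything else to False.
    return string.lower() in ('true', 't', 'yes', '1')

assert str2bool('True') and not str2bool('False')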
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 512 self.MAX_WORD_LENGTH = -1 self.number_normalized = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.word_alphabet.add(START) self.word_alphabet.add(UNKNOWN) self.char_alphabet.add(START) self.char_alphabet.add(UNKNOWN) self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label') self.tagScheme = "NoSeg" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.word_emb_dim = 50 self.pretrain_word_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_batch_size = 10 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0 self.HP_clip = 5.0 self.HP_momentum = 0 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % (self.HP_bilstm)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) print(" Hyperpara GPU: %s" % (self.HP_gpu)) print("DATA SUMMARY END.") sys.stdout.flush() def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0] if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() def build_word_pretrain_emb(self, emb_path, norm=False): self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, norm) def generate_instance(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( 
input_file, self.word_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( input_file, self.word_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance( input_file, self.word_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))
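The `Data` variants above that build a label alphabet all infer the tag scheme the same way inside `build_alphabet`: any `S-` label implies BMES, a `B-` label alone implies BIO. Isolated as a standalone sketch (the function name and sample labels are illustrative only):

def detect_tag_scheme(labels):
    # Mirrors the startS/startB scan in build_alphabet, including the
    # case-insensitive substring test on label.upper().
    has_s = any("S-" in label.upper() for label in labels)
    has_b = any("B-" in label.upper() for label in labels)
    if has_b:
        return "BMES" if has_s else "BIO"
    return "NoSeg"

assert detect_tag_scheme(["B-PER", "I-PER", "O"]) == "BIO"
assert detect_tag_scheme(["B-LOC", "M-LOC", "E-LOC", "S-PER", "O"]) == "BMES"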