def test_crf_marginals(xseq, yseq, algorithm):
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])

    y_pred_marginals = crf.predict_marginals([xseq])
    assert len(y_pred_marginals) == 1

    marginals = y_pred_marginals[0]
    assert len(marginals) == len(yseq)

    labels = crf.tagger_.labels()
    for m in marginals:
        assert isinstance(m, dict)
        assert set(m.keys()) == set(labels)
        assert abs(sum(m.values()) - 1.0) < 1e-6
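
# A self-contained sketch of how this test could be driven, assuming CRF is
# sklearn_crfsuite.CRF. The xseq/yseq fixtures below are hypothetical toy
# stand-ins for the real test fixtures; the algorithm list matches the
# trainers python-crfsuite supports.
import pytest
from sklearn_crfsuite import CRF


@pytest.fixture
def xseq():
    # One feature dict per token.
    return [{'word': 'hello'}, {'word': 'world'}]


@pytest.fixture
def yseq():
    return ['B', 'I']


@pytest.mark.parametrize('algorithm', ['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'])
def test_marginals_sum_to_one(xseq, yseq, algorithm):
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])
    # predict_marginals returns, per sequence, one {label: probability} dict
    # per token; each dict should sum to ~1.0.
    for token_marginals in crf.predict_marginals([xseq])[0]:
        assert abs(sum(token_marginals.values()) - 1.0) < 1e-6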
class ConditionalRandomFields(Tagger):
    """A Conditional Random Fields model."""

    @staticmethod
    def _predict_proba(X):
        del X

    @staticmethod
    def load(model_path):
        del model_path

    def fit(self, X, y):
        self._clf.fit(X, y)
        return self

    def set_params(self, **parameters):
        self._clf = CRF()
        self._clf.set_params(**parameters)
        return self

    def get_params(self, deep=True):
        return self._clf.get_params()

    def predict(self, X, dynamic_resource=None):
        return self._clf.predict(X)

    def predict_proba(self, examples, config, resources):
        """
        Args:
            examples (list of mindmeld.core.Query): a list of queries to predict on
            config (ModelConfig): The ModelConfig which may contain information
                used for feature extraction
            resources (dict): Resources which may be used for this model's
                feature extraction

        Returns:
            list of tuples of (mindmeld.core.QueryEntity): a list of predicted
                labels with confidence scores
        """
        X, _, _ = self.extract_features(examples, config, resources)
        seq = self._clf.predict(X)
        marginals_dict = self._clf.predict_marginals(X)
        marginal_tuples = []
        for query_index, query_seq in enumerate(seq):
            query_marginal_tuples = []
            for i, tag in enumerate(query_seq):
                query_marginal_tuples.append(
                    [tag, marginals_dict[query_index][i][tag]])
            marginal_tuples.append(query_marginal_tuples)
        return marginal_tuples

    def extract_features(self, examples, config, resources, y=None, fit=True):
        """Transforms a list of examples into a feature matrix.

        Args:
            examples (list of mindmeld.core.Query): a list of queries
            config (ModelConfig): The ModelConfig which may contain information
                used for feature extraction
            resources (dict): Resources which may be used for this model's
                feature extraction

        Returns:
            (list of list of str): features in CRF suite format
        """
        # Extract features and classes
        feats = []
        for example in examples:
            feats.append(
                self.extract_example_features(example, config, resources))
        X = self._preprocess_data(feats, fit)
        return X, y, None

    @staticmethod
    def extract_example_features(example, config, resources):
        """Extracts feature dicts for each token in an example.

        Args:
            example (mindmeld.core.Query): A query.
            config (ModelConfig): The ModelConfig which may contain information
                used for feature extraction.
            resources (dict): Resources which may be used for this model's
                feature extraction.

        Returns:
            list[dict]: Features.
        """
        return extract_sequence_features(example, config.example_type,
                                         config.features, resources)

    def _preprocess_data(self, X, fit=False):
        """Converts data into formats of CRF suite.

        Args:
            X (list of dict): features of an example
            fit (bool, optional): True if processing data at fit time,
                false for predict time.

        Returns:
            (list of list of str): features in CRF suite format
        """
        if fit:
            self._feat_binner.fit(X)

        new_X = []
        for feat_seq in self._feat_binner.transform(X):
            feat_list = []
            for feature in feat_seq:
                temp_list = []
                for feat_type in sorted(feature.keys()):
                    temp_list.append(
                        "{}={}".format(feat_type, str(feature[feat_type])))
                feat_list.append(temp_list)
            new_X.append(feat_list)
        return new_X

    def setup_model(self, config):
        self._feat_binner = FeatureBinner()
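
# For reference, a minimal stand-alone sketch of the "key=value" flattening
# that _preprocess_data performs above, with the FeatureBinner step omitted.
# The function name and the example feature keys are illustrative only.
def to_crfsuite_format(feat_seqs):
    """Convert [[{feat: val, ...}, ...], ...] into [[["feat=val", ...], ...], ...]."""
    return [
        [["{}={}".format(k, feats[k]) for k in sorted(feats)] for feats in seq]
        for seq in feat_seqs
    ]


# Example: one query with two tokens.
# to_crfsuite_format([[{'w': 'play', 'pos': 'VB'}, {'w': 'jazz', 'pos': 'NN'}]])
# -> [[['pos=VB', 'w=play'], ['pos=NN', 'w=jazz']]]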
class SeqModel():
    def __init__(self, data):
        print("build batched lstmcrf...")
        self.label_alphabet = data.label_alphabet
        self.word_alphabet = data.word_alphabet
        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_states=False,
            all_possible_transitions=True
        )
        self.examiner = Examiner(data)
        self.useExaminer = False
        self.loss_function = nn.NLLLoss()
        self.topk = 5
        self.X_train = []
        self.Y_train = []
        self.pos_mask_list = []
        self.instances = []
        self.scores_refs = []
        self.pos_mask = None
        self.tag_size = data.label_alphabet_size

    def masked_label(self, pos_mask, mask, batch_label, tag_seq):
        """Generate a masked label sequence: positions selected by pos_mask
        take the predicted tag, the rest keep the gold label."""
        batch_label = batch_label.mul(1 - pos_mask)
        tag_seq = Variable(tag_seq).mul(pos_mask)
        return batch_label + tag_seq

    def ner(self, sentence):
        sentence_features = [self.features(sentence, index)
                             for index in range(len(sentence))]
        return list(zip(sentence, self.crf.predict([sentence_features])[0]))

    def rand_mask(self, word_inputs, mask):
        """Generate a random position mask that zeroes out the top-k positions."""
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_word = batch_size * seq_len
        rand_vec = Variable(torch.rand(batch_size, seq_len), requires_grad=False)
        rand_vec = mask.cpu().float() * rand_vec
        if seq_len >= self.topk:
            topk, indices = rand_vec.topk(self.topk, dim=1)
        else:
            topk, indices = rand_vec.topk(seq_len, dim=1)
        pos_mask = Variable(torch.ones(batch_size, seq_len))
        pos_mask = pos_mask.scatter(1, indices, 0).long()
        return pos_mask.cuda()

    def sent2features(self, sent):
        return [self.features(sent, i) for i in range(len(sent))]

    @staticmethod
    def sent2labels(sent):
        return [label for token, postag, label in sent]

    @staticmethod
    def sent2tokens(sent):
        return [token for token, postag, label in sent]

    def tensor_to_sequence(self, _alphabet, word_inputs, label=True):
        if label:
            return [[_alphabet.get_instance(x) for x in word_inputs[0]]]
        return [self.sent2features(
            [_alphabet.get_instance(x) for x in word_inputs[0]])]

    def sequence_to_tensor(self, _alphabet, word_inputs):
        return torch.LongTensor(
            [[_alphabet.get_index(x) for x in word_inputs[0]]])

    def pos_selection(self, word_inputs, word_seq_lengths, char_inputs,
                      char_seq_lengths, char_seq_recover, batch_label, mask,
                      t=None, pos_mask=None):
        """Directly get the reward while generating the new label sequence."""
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        # Get the Viterbi tag sequence and per-token marginals from the CRF.
        tag_seq = self.sequence_to_tensor(
            self.label_alphabet,
            self.crf.predict(self.tensor_to_sequence(
                self.word_alphabet, word_inputs, label=False)))
        distributions = self.crf.predict_marginals(
            self.tensor_to_sequence(self.word_alphabet, word_inputs, label=False))
        tag_seq = tag_seq.cuda()
        tag_prob = Variable(torch.zeros(1, word_seq_lengths[0], self.tag_size))
        for j, key in enumerate(self.label_alphabet.instances):
            for i in range(word_seq_lengths[0]):
                if key in distributions[0][i]:
                    tag_prob[0, i, j] = distributions[0][i][key]
                else:
                    tag_prob[0, i, j] = 0.0
        if t is not None:
            t_mask = self.pos_mask_list[t]
            indices, pos_mask, scores_ref, score = self.examiner.neg_log_likelihood_loss(
                word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, batch_label, tag_seq, tag_prob,
                mask * (1 - t_mask).byte(), self.crf)
        else:
            indices, pos_mask, scores_ref, score = self.examiner.neg_log_likelihood_loss(
                word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, batch_label, tag_seq, tag_prob, mask, self.crf)
        self.pos_mask = pos_mask
        new_batch_label = self.masked_label(pos_mask, mask, batch_label, tag_seq)
        return new_batch_label, tag_seq, tag_prob, pos_mask, score, indices, scores_ref

    def add_instance(self, word_inputs, batch_label, pos_mask):
        """Add instances to the training dataset."""
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        self.X_train.append(self.tensor_to_sequence(
            self.word_alphabet, word_inputs, label=False)[0])
        self.Y_train.append(self.tensor_to_sequence(
            self.label_alphabet, batch_label)[0])
        if pos_mask is None:
            self.pos_mask_list.append(
                Variable(torch.zeros(batch_size, seq_len).long()))
        else:
            self.pos_mask_list.append(pos_mask)

    def reinforcement_reward(self, word_inputs, word_seq_lengths, char_inputs,
                             char_seq_lengths, char_seq_recover, batch_label,
                             tag_seq, tag_prob, mask, mode):
        """Directly get the reward instead of generating the new label sequence."""
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        tag_seq = self.sequence_to_tensor(
            self.label_alphabet,
            self.crf.predict(self.tensor_to_sequence(
                self.word_alphabet, word_inputs, label=False)))
        distributions = self.crf.predict_marginals(
            self.tensor_to_sequence(self.word_alphabet, word_inputs, label=False))
        tag_seq = tag_seq.cuda()
        tag_prob = Variable(torch.zeros(1, word_seq_lengths[0], self.tag_size))
        for j, key in enumerate(self.label_alphabet.instances):
            for i in range(word_seq_lengths[0]):
                if key in distributions[0][i]:
                    tag_prob[0, i, j] = distributions[0][i][key]
                else:
                    tag_prob[0, i, j] = 0.0
        # indices: the selected positions as indices
        # pos_mask: the selected positions as a mask vector
        indices, pos_mask, scores_ref, full_loss = self.examiner.neg_log_likelihood_loss(
            word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
            char_seq_recover, batch_label, tag_seq, tag_prob, mask, self.crf)
        if mode == "supervised_partial":
            return pos_mask, (full_loss * (1 - pos_mask.float())).sum()
        elif mode == "supervised_full":
            return pos_mask, full_loss.sum()
        else:
            return pos_mask, scores_ref

    def features(self, sent, i):
        # obtain some overall information of the point name string
        num_part = 4
        len_string = len(sent)
        mod = len_string % num_part
        part_size = int(math.floor(len_string / num_part))
        # determine which part the current character belongs to;
        # larger parts come first if the sequence can't be divided evenly
        size_list = []
        mod_count = 0
        for j in range(num_part):
            if mod_count < mod:
                size_list.append(part_size + 1)
                mod_count += 1
            else:
                size_list.append(part_size)
        # for current character
        part_cumulative = [0] * num_part
        for j in range(num_part):
            if j > 0:
                part_cumulative[j] = part_cumulative[j - 1] + size_list[j]
            else:
                part_cumulative[j] = size_list[j] - 1  # indices start from 0
        part_indicator = [0] * num_part
        for j in range(num_part):
            if part_cumulative[j] >= i:
                part_indicator[j] = 1
                break
        word = sent[i][0]
        if word.isdigit():
            itself = 'NUM'
        else:
            itself = word
        features = {
            'word': itself,
            'part0': part_indicator[0] == 1,
            'part1': part_indicator[1] == 1,
            'part2': part_indicator[2] == 1,
            'part3': part_indicator[3] == 1,
        }
        # for previous character
        if i > 0:
            part_indicator = [0] * num_part
            for j in range(num_part):
                if part_cumulative[j] >= i - 1:
                    part_indicator[j] = 1
                    break
            word1 = sent[i - 1]
            if word1.isdigit():
                itself1 = 'NUM'
            else:
                itself1 = word1
            features.update({
                '-1:word': itself1,
                '-1:part0': part_indicator[0] == 1,
                '-1:part1': part_indicator[1] == 1,
                '-1:part2': part_indicator[2] == 1,
                '-1:part3': part_indicator[3] == 1,
            })
        else:
            features['BOS'] = True
        # for next character
        if i < len(sent) - 1:
            part_indicator = [0] * num_part
            for j in range(num_part):
                if part_cumulative[j] >= i + 1:
                    part_indicator[j] = 1
                    break
            word1 = sent[i + 1]
            if word1.isdigit():
                itself1 = 'NUM'
            else:
                itself1 = word1
            features.update({
                '+1:word': itself1,
                '+1:part0': part_indicator[0] == 1,
                '+1:part1': part_indicator[1] == 1,
                '+1:part2': part_indicator[2] == 1,
                '+1:part3': part_indicator[3] == 1,
            })
        else:
            features['EOS'] = True
        return features

    def train(self):
        self.crf.fit(self.X_train, self.Y_train)

    def sample_train(self, left, right):
        self.crf.fit(self.X_train[left:right], self.Y_train[left:right])

    def clear(self):
        self.X_train = []
        self.Y_train = []
        self.pos_mask_list = []

    def test(self, word_inputs):
        tag_seq = self.sequence_to_tensor(
            self.label_alphabet,
            self.crf.predict(self.tensor_to_sequence(
                self.word_alphabet, word_inputs, label=False)))
        return Variable(tag_seq)
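
# A stand-alone sketch of the marginals-to-tensor packing used by
# pos_selection and reinforcement_reward above. `labels` stands in for
# label_alphabet.instances; the function name is illustrative only.
import torch


def marginals_to_tensor(marginals, labels, seq_len):
    """Pack sklearn-crfsuite marginals (one {label: prob} dict per token)
    into a (1, seq_len, n_labels) float tensor."""
    tag_prob = torch.zeros(1, seq_len, len(labels))
    for i in range(seq_len):
        for j, label in enumerate(labels):
            # Labels absent from the marginal dict get probability 0.0.
            tag_prob[0, i, j] = marginals[i].get(label, 0.0)
    return tag_prob


# Usage: tag_prob = marginals_to_tensor(crf.predict_marginals(X)[0], labels, len(X[0]))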
class EntityExtractor:
    def __init__(
        self,
        hyper_params: Dict[str, float] = None,
        model_path: str = None,
    ):
        # The feature extractor is needed for prediction regardless of whether
        # the CRF is trained fresh or loaded from disk.
        self.fe = FeatureExtractor()
        if model_path:
            self.load_model(model_path=model_path)
        else:
            hyper_params = hyper_params or {}
            algorithm = hyper_params.get("algorithm", "lbfgs")
            c1 = hyper_params.get("c1", 0.1)
            c2 = hyper_params.get("c2", 0.1)
            max_iters = hyper_params.get("max_iterations", 100)
            apt = hyper_params.get("all_possible_transitions", True)
            self.crf = CRF(
                algorithm=algorithm,
                c1=c1,
                c2=c2,
                max_iterations=max_iters,
                all_possible_transitions=apt,
            )

    def save_model(self, output_path: str):
        """Save the model."""
        with open(output_path, "wb") as f:
            pickle.dump(self.crf, f)

    def load_model(self, model_path: str):
        """Load the model."""
        with open(model_path, "rb") as f:
            self.crf = pickle.load(f)

    def train(self, sentences: List[Sentence]):
        """Execute training."""
        x = [self.fe.extract_feature(s) for s in sentences]
        y = [s.labels for s in sentences]
        self.crf.fit(x, y)

    def evaluate(self, sentences: List[Sentence], fix_invalid_labels=True,
                 decoder='greedy', k=5, debug=False):
        """Return the weighted F1 score on the given sentences."""
        x = [self.fe.extract_feature(s) for s in sentences]
        y_test = [s.labels for s in sentences]
        y_pred = self.__predict(x, decoder, k)
        if debug:
            for t, p in list(zip(y_test, y_pred))[:10]:
                print(t)
                print(p)
        print('raw, ', decoder)
        print(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted(LABELS), digits=3))
        if fix_invalid_labels:
            print('fix-invalid, ', decoder)
            y_pred = self.fix_labels(y_pred)
            if debug:
                for t, p in list(zip(y_test, y_pred))[:10]:
                    print(t)
                    print(p)
            print(metrics.flat_classification_report(
                y_test, y_pred, labels=sorted(LABELS), digits=3))
        return metrics.flat_f1_score(y_test, y_pred,
                                     average='weighted', labels=LABELS)

    def fix_labels(self, labels):
        labels_fixed = []
        with multiprocessing.Pool() as p:
            labels_fixed.append(p.map(_fix_valid, labels))
        return labels_fixed[0]

    def __predict(self, x, decoder='greedy', k=5):
        if decoder == 'greedy':
            return self.crf.predict(x)
        y_probs = self.__predict_prob(x)
        # (n_data, max_sequence_length, n_labels) -> (n_data, k, max_sequence_length)
        args = [(probability, k) for probability in y_probs]
        results = []
        with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
            results.append(p.map(_beam_search_decoder, args))
        beams_list = results[0]
        # Keep only the valid label sequences.
        args = [[[LABELS[idx - 1] if idx > 0 else 'O' for idx in beam[0]]
                 for beam in beams] for beams in beams_list]
        labels_selected = []
        with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
            labels_selected.append(p.map(_filter_valid, args))
        return labels_selected[0]

    def predict(self, sentences: List[Sentence], decoder='greedy', k=5):
        """Return predicted classes."""
        x = [self.fe.extract_feature(s) for s in sentences]
        return self.__predict(x, decoder, k)

    def __predict_prob(self, x):
        # (n_data, max_sequence_length, n_labels)
        probs = self.crf.predict_marginals(x)
        return [[[p_token['O']] + [p_token[label] for label in LABELS]
                 for p_token in prob] for prob in probs]

    def predict_prob(self, sentences: List[Sentence]):
        """Return probabilities for each class."""
        x = [self.fe.extract_feature(s) for s in sentences]
        return self.__predict_prob(x)
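
# _beam_search_decoder and _filter_valid are defined elsewhere in this project.
# For reference, here is a minimal, self-contained sketch of beam-search
# decoding over the per-token probability lists that __predict_prob produces;
# the real helper may differ in detail.
import math


def beam_search_decoder(args):
    probs, k = args  # probs: seq_len lists of n_labels probabilities
    beams = [([], 0.0)]  # (label-index sequence, cumulative log-probability)
    for token_probs in probs:
        candidates = [
            (seq + [j], score + math.log(p + 1e-12))
            for seq, score in beams
            for j, p in enumerate(token_probs)
        ]
        # Keep only the k highest-scoring partial sequences.
        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:k]
    return beams


# Example: beam_search_decoder(([[0.6, 0.4], [0.3, 0.7]], 2))
# -> [([0, 1], log 0.42), ([1, 1], log 0.28)]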
class BiLSTM_CRF():
    def __init__(self, data):
        print("build batched lstmcrf...")
        self.label_alphabet = data.label_alphabet
        self.word_alphabet = data.word_alphabet
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.examiner = Examiner(data)
        self.useExaminer = False
        # Assumed default: rand_mask reads self.full below, but the original
        # never initialized it.
        self.full = False
        self.loss_function = nn.NLLLoss()
        self.topk = 5
        self.X_train = []
        self.Y_train = []
        self.pos_mask_list = []
        self.instances = []
        self.scores_refs = []
        self.pos_mask = None
        self.tag_size = data.label_alphabet_size

    # For the afterward updating of the CRF.
    def masked_label(self, pos_mask, mask, batch_label, tag_seq):
        batch_label = batch_label.mul(1 - pos_mask)
        tag_seq = Variable(tag_seq).cuda().mul(pos_mask)
        return batch_label + tag_seq

    def ner(self, sentence):
        sentence_features = [
            self.features(sentence, index) for index in range(len(sentence))
        ]
        return list(zip(sentence, self.crf.predict([sentence_features])[0]))

    def rand_mask(self, word_inputs, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        total_word = batch_size * seq_len
        if self.full:
            return Variable(torch.zeros(batch_size, seq_len).cuda().long(),
                            requires_grad=False)
        rand_vec = Variable(torch.rand(batch_size, seq_len),
                            requires_grad=False)
        rand_vec = mask.float() * rand_vec.cuda()
        if seq_len >= self.topk:
            topk, indices = rand_vec.topk(self.topk, dim=1)
        else:
            topk, indices = rand_vec.topk(seq_len, dim=1)
        pos_mask = Variable(torch.ones(batch_size, seq_len).cuda())
        pos_mask = pos_mask.scatter(1, indices, 0).long()
        return pos_mask

    # For the afterward updating of the CRF.
    def sent2features(self, sent):
        return [self.features(sent, i) for i in range(len(sent))]

    @staticmethod
    def sent2labels(sent):
        return [label for token, postag, label in sent]

    @staticmethod
    def sent2tokens(sent):
        return [token for token, postag, label in sent]

    def tensor_to_sequence(self, _alphabet, word_inputs, label=True):
        if label:
            return [[_alphabet.get_instance(x.data[0]) for x in word_inputs[0]]]
        return [self.sent2features(
            [_alphabet.get_instance(x.data[0]) for x in word_inputs[0]])]

    def sequence_to_tensor(self, _alphabet, word_inputs):
        return torch.LongTensor(
            [[_alphabet.get_index(x) for x in word_inputs[0]]])

    def crf_loss(self, word_inputs, word_seq_lengths, char_inputs,
                 char_seq_lengths, char_seq_recover, batch_label, mask,
                 t=None, pos_mask=None):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        # Get the Viterbi tag sequence and per-token marginals from the CRF.
        tag_seq = self.sequence_to_tensor(
            self.label_alphabet,
            self.crf.predict(self.tensor_to_sequence(
                self.word_alphabet, word_inputs, label=False)))
        marginals = self.crf.predict_marginals(
            self.tensor_to_sequence(self.word_alphabet, word_inputs,
                                    label=False))
        tag_prob = Variable(
            torch.zeros(1, word_seq_lengths[0], self.tag_size).cuda())
        for j, key in enumerate(self.label_alphabet.instances):
            for i in range(word_seq_lengths[0]):
                if key in marginals[0][i]:
                    tag_prob[0, i, j] = marginals[0][i][key]
                else:
                    tag_prob[0, i, j] = 0.0
        if t is not None:
            t_mask = self.pos_mask_list[t]
            indices, pos_mask, scores_ref, score, correct = \
                self.examiner.neg_log_likelihood_loss(
                    word_inputs, word_seq_lengths, char_inputs,
                    char_seq_lengths, char_seq_recover, batch_label, tag_seq,
                    tag_prob, mask * t_mask.byte())
        else:
            indices, pos_mask, scores_ref, score, correct = \
                self.examiner.neg_log_likelihood_loss(
                    word_inputs, word_seq_lengths, char_inputs,
                    char_seq_lengths, char_seq_recover, batch_label, tag_seq,
                    tag_prob, mask)
        self.pos_mask = pos_mask
        batch_label = self.masked_label(pos_mask, mask, batch_label, tag_seq)
        return batch_label, tag_seq, tag_prob, pos_mask, score, indices, scores_ref

    def add_instance(self, word_inputs, batch_label, pos_mask, instance,
                     scores_ref):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        self.X_train.append(self.tensor_to_sequence(
            self.word_alphabet, word_inputs, label=False)[0])
        self.Y_train.append(self.tensor_to_sequence(
            self.label_alphabet, batch_label)[0])
        if pos_mask is None:
            self.pos_mask_list.append(
                Variable(torch.zeros(batch_size, seq_len).long()).cuda())
        else:
            self.pos_mask_list.append(pos_mask)
        self.instances.append(instance)
        self.scores_refs.append(scores_ref)

    def readd_instance(self, batch_label, mask, pos_mask, i, scores_ref):
        tag_seq = self.sequence_to_tensor(self.label_alphabet,
                                          self.crf.predict([self.X_train[i]]))
        pos_mask = self.pos_mask_list[i].long() * pos_mask.long()
        batch_label = self.masked_label(pos_mask, mask, batch_label, tag_seq)
        self.Y_train[i] = self.tensor_to_sequence(self.label_alphabet,
                                                  batch_label)[0]
        self.pos_mask_list[i] = pos_mask
        self.scores_refs[i] = scores_ref

    def reinforment_reward(self, word_inputs, word_seq_lengths, char_inputs,
                           char_seq_lengths, char_seq_recover, batch_label,
                           tag_seq, tag_prob, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        indices, pos_mask, scores_ref, score, correct = \
            self.examiner.neg_log_likelihood_loss(
                word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, batch_label, tag_seq, tag_prob, mask)
        return pos_mask, scores_ref, (score * (1 - pos_mask.float())).sum()

    def reinforment_supervised(self, word_inputs, word_seq_lengths,
                               char_inputs, char_seq_lengths, char_seq_recover,
                               batch_label, tag_seq, tag_prob, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        # Get the selected positions.
        indices, pos_mask, scores_ref, score, correct = \
            self.examiner.neg_log_likelihood_loss(
                word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, batch_label, tag_seq, tag_prob, mask)
        return pos_mask, score

    def pop_instance(self, x):
        self.X_train.pop(0)
        self.Y_train.pop(0)

    def reevaluate_instance(self, mask):
        for i in range(len(self.X_train)):
            tag_seq = self.sequence_to_tensor(
                self.label_alphabet, self.crf.predict([self.X_train[i]]))
            pos_mask = self.pos_mask_list[i]
            batch_label = self.masked_label(
                pos_mask, mask,
                Variable(self.sequence_to_tensor(
                    self.label_alphabet, [self.Y_train[i]])).cuda(),
                tag_seq)
            self.Y_train[i] = self.tensor_to_sequence(self.label_alphabet,
                                                      batch_label)[0]

    def features(self, sent, i):
        # obtain some overall information of the point name string
        num_part = 4
        len_string = len(sent)
        mod = len_string % num_part
        part_size = int(math.floor(len_string / num_part))
        # determine which part the current character belongs to;
        # larger parts come first if the sequence can't be divided evenly
        size_list = []
        mod_count = 0
        for j in range(num_part):
            if mod_count < mod:
                size_list.append(part_size + 1)
                mod_count += 1
            else:
                size_list.append(part_size)
        # for current character
        part_cumulative = [0] * num_part
        for j in range(num_part):
            if j > 0:
                part_cumulative[j] = part_cumulative[j - 1] + size_list[j]
            else:
                part_cumulative[j] = size_list[j] - 1  # indices start from 0
        part_indicator = [0] * num_part
        for j in range(num_part):
            if part_cumulative[j] >= i:
                part_indicator[j] = 1
                break
        word = sent[i][0]
        if word.isdigit():
            itself = 'NUM'
        else:
            itself = word
        features = {
            'word': itself,
            'part0': part_indicator[0] == 1,
            'part1': part_indicator[1] == 1,
            'part2': part_indicator[2] == 1,
            'part3': part_indicator[3] == 1,
        }
        # for previous character
        if i > 0:
            part_indicator = [0] * num_part
            for j in range(num_part):
                if part_cumulative[j] >= i - 1:
                    part_indicator[j] = 1
                    break
            word1 = sent[i - 1]
            if word1.isdigit():
                itself1 = 'NUM'
            else:
                itself1 = word1
            features.update({
                '-1:word': itself1,
                '-1:part0': part_indicator[0] == 1,
                '-1:part1': part_indicator[1] == 1,
                '-1:part2': part_indicator[2] == 1,
                '-1:part3': part_indicator[3] == 1,
            })
        else:
            features['BOS'] = True
        # for next character
        if i < len(sent) - 1:
            part_indicator = [0] * num_part
            for j in range(num_part):
                if part_cumulative[j] >= i + 1:
                    part_indicator[j] = 1
                    break
            word1 = sent[i + 1]
            if word1.isdigit():
                itself1 = 'NUM'
            else:
                itself1 = word1
            features.update({
                '+1:word': itself1,
                '+1:part0': part_indicator[0] == 1,
                '+1:part1': part_indicator[1] == 1,
                '+1:part2': part_indicator[2] == 1,
                '+1:part3': part_indicator[3] == 1,
            })
        else:
            features['EOS'] = True
        return features

    def train(self):
        self.crf.fit(self.X_train, self.Y_train)

    def sample_train(self, left, right):
        self.crf.fit(self.X_train[left:right], self.Y_train[left:right])

    def test(self, word_inputs):
        tag_seq = self.sequence_to_tensor(
            self.label_alphabet,
            self.crf.predict(self.tensor_to_sequence(
                self.word_alphabet, word_inputs, label=False)))
        return Variable(tag_seq).cuda()
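
# A small numeric sketch of the mixing rule in masked_label above: positions
# where pos_mask == 1 take the CRF's predicted tag, the rest keep the gold
# label (plain tensors here instead of Variables, for illustration).
import torch

gold = torch.tensor([[3, 1, 4, 1]])
pred = torch.tensor([[2, 2, 2, 2]])
pos_mask = torch.tensor([[0, 1, 0, 1]])
mixed = gold.mul(1 - pos_mask) + pred.mul(pos_mask)
# mixed -> tensor([[3, 2, 4, 2]])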