class HMMClassifier(object):
    """HMM-based classifier for segmenting bibliographic reference strings.

    Wraps three HMMs: one scoring author-like segments, one scoring
    venue-like segments, and one decoding an entire record into fields.
    Label indices used throughout (established by the decode table below):
    0=FN (first name), 1=LN (last name), 2=DL (delimiter), 3=TI (title),
    4=VN (venue), 5=YR (year).

    NOTE(review): ported from Python 2 — ``print`` statements,
    ``dict.has_key`` and ``dict.iteritems`` were replaced with their
    Python 3 equivalents.
    """

    # Human-readable tags for the six hidden-state labels.
    LABEL_NAMES = {0: 'FN', 1: 'LN', 2: 'DL', 3: 'TI', 4: 'VN', 5: 'YR'}

    def __init__(self, **kwarg):
        # **kwarg reserved for prior knowledge (lname, url, ...) — currently unused
        super(HMMClassifier, self).__init__()
        self.HMMauthor = HMM('author', 2)
        self.HMMvenue = HMM('venue', 2)    # not important (per original author)
        self.HMMentire = HMM('entire', 6)  # 6 states — set empirically
        self.observations_raw = []         # raw input segments, in decode order
        self.observation_sequences = []    # feature vectors, one list per segment
        self.labels = []                   # decoded label sequences per segment

    def predict(self, segment):
        """Print *segment* and its likelihood under the author and venue HMMs."""
        author_likelihood = self.HMMauthor.evaluate(segment)
        venue_likelihood = self.HMMvenue.evaluate(segment)
        print(segment)
        print('author likelihood:\t', author_likelihood)
        print('venue likelihood:\t', venue_likelihood)

    def decode(self, segment):
        """Decode *segment* with the 'entire' HMM and split it into fields.

        Returns ``(author, title, venue, years)``: the first three are
        space-joined token strings, the last a de-duplicated list of year
        tokens.  The decoding is also appended to the instance history lists.
        """
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)
        # Route each token to its field by decoded label, preserving token
        # order within each field (FN and LN both feed the author field).
        author_field, title_field, venue_field, year_field = [], [], [], []
        raw_tokens = Tokens(segment).tokens
        for token, label in zip(raw_tokens, decoded_sequence):
            if label in (0, 1):      # FN / LN -> author
                author_field.append(token)
            elif label == 3:         # TI -> title
                title_field.append(token)
            elif label == 4:         # VN -> venue
                venue_field.append(token)
            elif label == 5:         # YR -> year
                year_field.append(token)
            # label 2 (DL, delimiter) is deliberately dropped
        # TODO(review): the original sketches an additional step — summing
        # per-field likelihoods P(X1|FN,LN,DL...) + P(X2|TI...) + ... and
        # finding field boundaries — which was never implemented.
        return (' '.join(author_field), ' '.join(title_field),
                ' '.join(venue_field), list(set(year_field)))

    def decode_without_constraints(self, segment):
        """Decode *segment* without transition constraints and print a
        per-token table of (feature vector, label tag, token)."""
        print(segment)
        observation_sequence, decoded_sequence = \
            self.HMMentire.decode_without_constraints(segment)
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)
        for vector, decoding, token in zip(observation_sequence,
                                           decoded_sequence,
                                           Tokens(segment).tokens):
            # Unknown label indices are flagged loudly rather than dropped.
            label = self.LABEL_NAMES.get(decoding, str(decoding) + ', PROBLEM')
            print(vector, '\t', label, '\t', token)
        print('\n\n')

    def cross_correct(self):
        """Print the distribution of the first decoded label across all
        segments seen so far (step 1 of a planned cross-correction pass)."""
        counter = {}
        for decoded in self.labels:
            first_label = str(decoded[0])
            # dict.get replaces the Py2-only has_key check-then-set idiom
            counter[first_label] = counter.get(first_label, 0) + 1
        sorted_counter = sorted(counter.items(),
                                key=operator.itemgetter(1), reverse=True)
        print('First labels distribution: ', sorted_counter)

    def serialize(self):
        """Pickle this classifier to 'hmmc.pkl' using the highest protocol."""
        # `with` guarantees the handle is closed even if pickling raises
        with open('hmmc.pkl', 'wb') as fp:
            pickle.dump(self, fp, -1)
class HMMClassifier(object):
    # NOTE(review): this is a duplicate re-declaration of HMMClassifier and
    # shadows any earlier definition of the same name in this file.
    # Python 2 code: relies on `print` statements, dict.has_key and
    # dict.iteritems, all of which are invalid/removed in Python 3.
    # Label indices (per decode_without_constraints below): 0=FN, 1=LN,
    # 2=DL (delimiter), 3=TI, 4=VN, 5=YR.

    def __init__(self, **kwarg):
        # lname, url, other prior knowledge
        # **kwarg is accepted but never read — presumably reserved for the
        # prior knowledge mentioned above; TODO confirm with callers.
        super(HMMClassifier, self).__init__()
        self.HMMauthor = HMM('author', 2)
        self.HMMvenue = HMM('venue', 2)  # Not important
        self.HMMentire = HMM('entire', 6)  # Set empirically
        # Parallel histories appended to by decode()/decode_without_constraints():
        self.observations_raw = []        # raw input segments
        self.observation_sequences = []   # feature-vector sequences
        self.labels = []                  # decoded label sequences

    def predict(self, segment):
        # Prints the segment and its likelihood under the author and venue HMMs.
        author_likelihood = self.HMMauthor.evaluate(segment)
        venue_likelihood = self.HMMvenue.evaluate(segment)
        print segment
        print 'author likelihood:\t', author_likelihood
        print 'venue likelihood:\t', venue_likelihood

    def decode(self, segment):
        # Decodes a full reference segment into (author, title, venue, years).
        # print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)
        # segment the labeling into parts
        author_field = []
        title_field = []
        venue_field = []
        year_field = []
        raw_tokens = Tokens(segment).tokens
        # Assumes raw_tokens is at least as long as decoded_sequence —
        # otherwise the indexing below raises IndexError; TODO confirm.
        for i in range(len(decoded_sequence)):
            token_i = raw_tokens[i]
            label_i = decoded_sequence[i]
            if label_i in [0, 1]:
                # FN/LN tokens both feed the author field, in order
                author_field.append(token_i)
            if label_i == 2:
                # delimiter tokens are dropped
                continue
            if label_i == 3:
                title_field.append(token_i)
            if label_i == 4:
                venue_field.append(token_i)
            if label_i == 5:
                year_field.append(token_i)
        # Years are de-duplicated (and therefore unordered).
        return ' '.join(author_field), ' '.join(title_field), ' '.join(
            venue_field), list(set(year_field))
        # Additional step: to calculate the overall sum of
        # P(X1|FN,LN,DL...) + P(X2|TI,TI,TI...) + P(X3|VN,VN,VN...) + P(X4|DT)
        # 1. Find boundaries:
        # boundaries = [[], [], []]
        # label_ranges = [[0,1,2], [3], [2,4,5]]
        # for i in range(len(label_ranges)):
        #     label_range = label_ranges[i]
        #     for j in range(len(decoded_sequence)):
        #         if decoded_sequence[j] in label_range:
        #             boundaries[i].append()

    def decode_without_constraints(self, segment):
        # Decodes without transition constraints and prints a per-token
        # table of feature vector, label tag, and token.
        print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(
            segment)
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)
        for vector, decoding, token in zip(observation_sequence,
                                           decoded_sequence,
                                           Tokens(segment).tokens):
            if decoding == 0:
                label = 'FN'
            elif decoding == 1:
                label = 'LN'
            elif decoding == 2:
                label = 'DL'
            elif decoding == 3:
                label = 'TI'
            elif decoding == 4:
                label = 'VN'
            elif decoding == 5:
                label = 'YR'
            else:
                # any label outside 0-5 is flagged loudly
                label = str(decoding) + ', PROBLEM'
            print vector, '\t', label, '\t', token
        print '\n\n'

    def cross_correct(self):
        # Prints how often each label appears first across decoded segments.
        # absolute_correct/absolute_wrong are unused placeholders for the
        # later (unimplemented) correction steps.
        absolute_correct = []
        absolute_wrong = []
        # 1. Confirm what's the big structure of the publication inside this specific domain
        counter = {}
        for l in self.labels:
            first_label = str(l[0])
            if counter.has_key(first_label):
                counter[first_label] += 1
            else:
                counter[first_label] = 1
        sorted_counter = sorted(counter.iteritems(),
                                key=operator.itemgetter(1), reverse=True)
        print 'First labels distribution: ', sorted_counter

    def serialize(self):
        # Pickles the whole classifier to hmmc.pkl (protocol -1 = highest).
        fp = open('hmmc.pkl', 'wb')
        pickle.dump(self, fp, -1)
        fp.close()
# NOTE(review): this chunk begins mid-branch — the `elif` below implies an
# enclosing `if model_type == ...:` chain whose opening statement is outside
# this view (presumably the HMM branch); the indentation below reconstructs
# that structure.  The chunk also ends mid-branch.
    # Keep (token, entity) pairs when entity labels are requested, otherwise
    # (token, POS) pairs.  Assumes each token tuple t is
    # (word, pos_tag, entity_tag) — TODO confirm against the data loader.
    if load_entities is True:
        data_train = [[(t[0], t[2]) for t in sent] for sent in data_train]
        data_test = [[(t[0], t[2]) for t in sent] for sent in data_test]
    else:
        data_train = [[(t[0], t[1]) for t in sent] for sent in data_train]
        data_test = [[(t[0], t[1]) for t in sent] for sent in data_test]

    hmm = HMM()
    start_time = time.time()
    hmm.fit(data_train)
    print(f"Duration of training: {time.time() - start_time}")

    # evaluation hmm
    # -------------------------------------------------------------------------
    # plot confusion matrix, calculate precision, recall, f1-score
    hmm.evaluate(data_test)

    # show misclassifications
    features_test, labels_test = separate_labels_from_features(data_test)
    predictions = hmm.predict(features_test)
    show_misclassifications(data_test, predictions)

elif model_type == "NB":
    # fit naive bayes model
    # -------------------------------------------------------------------------
    nb = Naive_Bayes()
    # POS features by default; NER features when entities were loaded
    data_train_featurized = feature_maker.get_pos_features_nltk(
        data_train
    ) if not load_entities else feature_maker.get_ner_features_nltk(data_train)
    data_train_featurized = flatten(data_train_featurized)
    start_time = time.time()
    # NOTE(review): chunk ends here mid-branch; NB training continues past view
# Train an HMM tagger on the configured corpus, evaluate it on the dev set,
# and — in big-data mode — reload the saved model and score the test set.
corpus = Corpus(config.ftrain)
print(corpus)

print("Load the dataset")
train_data = corpus.load(config.ftrain)
dev_data = corpus.load(config.fdev)
print(f" size of trainset: {len(train_data)}\n"
      f" size of devset: {len(dev_data)}")
if args.bigdata:
    test_data = corpus.load(config.ftest)
    print(f" size of testset: {len(test_data)}")

t0 = datetime.now()

print("Create HMM")
hmm = HMM(corpus.nw, corpus.nt)
print("Use %d sentences to train the HMM" % corpus.ns)
hmm.train(trainset=train_data, alpha=config.alpha, file=args.file)

print("Use Viterbi algorithm to tag the dataset")
tp, total, accuracy = hmm.evaluate(dev_data)
print("Accuracy of dev: %d / %d = %4f\n" % (tp, total, accuracy))

if args.bigdata:
    # reload the model persisted during training and score the held-out set
    hmm = HMM.load(args.file)
    tp, total, accuracy = hmm.evaluate(test_data)
    print("Accuracy of test: %d / %d = %4f" % (tp, total, accuracy))

print("%ss elapsed" % (datetime.now() - t0))
def dataFormatter(filename):
    """Read whitespace-separated uppercase strings from *filename* and
    return them as lists of 0-based letter indices ('A' -> 0, 'B' -> 1, ...).

    A list comprehension replaces the original ``map`` call, which under
    Python 3 would have returned a lazy iterator rather than a list.
    """
    with open(filename, 'r') as f:
        raw_sequences = [list(s) for s in f.read().split()]
    sample = []
    for seq in raw_sequences:
        sample.append([ord(ch) - ord('A') for ch in seq])
    return sample


def main():
    """Smoke-test an HMM: random init, sanity-print row sums, train, evaluate."""
    model = HMM()
    model.init_random(3, 4)
    # Each row of trans/emit should sum to ~1 if init_random normalizes —
    # printed here for manual inspection.
    print(model.trans)
    for row in model.trans:
        print(sum(row))
    print(model.emit)
    for row in model.emit:
        print(sum(row))

    sample = dataFormatter('hmm_test1.in')
    print(sample)
    model.train(sample)

    for seq in dataFormatter('hmm_evaluation.in'):
        print(model.evaluate(seq))


if __name__ == '__main__':
    # Guard keeps the training run from firing when this module is imported.
    main()