Example #1
import operator
import pickle

# HMM and Tokens are assumed to be defined elsewhere in the project.

class HMMClassifier(object):
    def __init__(self, **kwargs):    # lname, url, other prior knowledge
        super(HMMClassifier, self).__init__()
        self.HMMauthor = HMM('author', 2)
        self.HMMvenue = HMM('venue', 2)     # Not important
        self.HMMentire = HMM('entire', 6)   # Number of hidden states, set empirically
        self.observations_raw = []
        self.observation_sequences = []
        self.labels = []


    def predict(self, segment):
        author_likelihood = self.HMMauthor.evaluate(segment)
        venue_likelihood = self.HMMvenue.evaluate(segment)
        print(segment)
        print('author likelihood:\t', author_likelihood)
        print('venue likelihood:\t', venue_likelihood)

    def decode(self, segment):
        # print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)
        
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)


        # segment the labeling into parts
        author_field = []
        title_field = []
        venue_field = []
        year_field = []
        raw_tokens = Tokens(segment).tokens
        for i in range(len(decoded_sequence)):
            token_i = raw_tokens[i]
            label_i = decoded_sequence[i]
            if label_i in [0, 1]:           # first name / last name
                author_field.append(token_i)
            elif label_i == 2:              # delimiter, skipped
                continue
            elif label_i == 3:              # title
                title_field.append(token_i)
            elif label_i == 4:              # venue
                venue_field.append(token_i)
            elif label_i == 5:              # year
                year_field.append(token_i)

        return ' '.join(author_field), ' '.join(title_field), ' '.join(venue_field), list(set(year_field))
        # Additional step: to calculate the overall sum of P(X1|FN,LN,DL...) + P(X2|TI,TI,TI...) + P(X3|VN,VN,VN...) + P(X4|DT)
        # 1. Find boundaries: 
        # boundaries = [[], [], []]
        # label_ranges = [[0,1,2], [3], [2,4,5]]
        # for i in range(len(label_ranges)):
        #     label_range = label_ranges[i]
        #     for j in range(len(decoded_sequence)):
        #         if decoded_sequence[j] in label_range:
        #             boundaries[i].append()

    
    def decode_without_constraints(self, segment):
        print(segment)
        observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(segment)
        
        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        for vector, decoding, token in zip(observation_sequence, decoded_sequence, Tokens(segment).tokens):
            if decoding == 0:
                label = 'FN'
            elif decoding == 1:
                label = 'LN'
            elif decoding == 2:
                label = 'DL'
            elif decoding == 3:
                label = 'TI'
            elif decoding == 4:
                label = 'VN'
            elif decoding == 5:
                label = 'YR'
            else:
                label = str(decoding) + ', PROBLEM'
            print(vector, '\t', label, '\t', token)
        print('\n\n')

    def cross_correct(self):
        absolute_correct = []
        absolute_wrong = []

        # 1. Confirm the overall structure of publications in this specific domain
        counter = {}
        for l in self.labels:
            first_label = str(l[0])
            if first_label in counter:
                counter[first_label] += 1
            else:
                counter[first_label] = 1
        sorted_counter = sorted(counter.items(), key=operator.itemgetter(1), reverse=True)
        print('First labels distribution: ', sorted_counter)


    def serialize(self):
        with open('hmmc.pkl', 'wb') as fp:
            pickle.dump(self, fp, -1)
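
A minimal counterpart to serialize(), shown here as a sketch: it assumes
hmmc.pkl was produced by the method above and that the HMM and Tokens
classes are importable when the pickle is loaded.

import pickle

# Restore a previously serialized HMMClassifier.
with open('hmmc.pkl', 'rb') as fp:
    classifier = pickle.load(fp)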
Example #2
    if load_entities:
        # keep (token, entity-label) pairs
        data_train = [[(t[0], t[2]) for t in sent] for sent in data_train]
        data_test = [[(t[0], t[2]) for t in sent] for sent in data_test]
    else:
        # keep (token, POS-tag) pairs
        data_train = [[(t[0], t[1]) for t in sent] for sent in data_train]
        data_test = [[(t[0], t[1]) for t in sent] for sent in data_test]

    hmm = HMM()
    start_time = time.time()
    hmm.fit(data_train)
    print(f"Duration of training: {time.time() - start_time}")

    # evaluation hmm
    # -------------------------------------------------------------------------
    # plot confusion matrix, calculate precision, recall, f1-score
    hmm.evaluate(data_test)
    # show misclassifications
    features_test, labels_test = separate_labels_from_features(data_test)
    predictions = hmm.predict(features_test)
    show_misclassifications(data_test, predictions)

elif model_type == "NB":
    # fit naive bayes model
    # -------------------------------------------------------------------------
    nb = Naive_Bayes()
    data_train_featurized = feature_maker.get_pos_features_nltk(
        data_train
    ) if not load_entities else feature_maker.get_ner_features_nltk(data_train)

    data_train_featurized = flatten(data_train_featurized)
    start_time = time.time()
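
For reference, a plausible implementation of the separate_labels_from_features
helper called above; this is a sketch assuming data_test is a list of
sentences of (token, label) pairs, and the project's actual helper may differ.

def separate_labels_from_features(data):
    # Split [(token, label), ...] sentences into parallel
    # token and label sequences.
    features = [[token for token, _ in sent] for sent in data]
    labels = [[label for _, label in sent] for sent in data]
    return features, labels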
Example #3
    corpus = Corpus(config.ftrain)
    print(corpus)

    print("Load the dataset")
    trainset = corpus.load(config.ftrain)
    devset = corpus.load(config.fdev)
    print("  size of trainset: %d\n"
          "  size of devset: %d" % (len(trainset), len(devset)))
    if args.bigdata:
        testset = corpus.load(config.ftest)
        print("  size of testset: %d" % len(testset))

    start = datetime.now()

    print("Create HMM")
    hmm = HMM(corpus.nw, corpus.nt)

    print("Use %d sentences to train the HMM" % corpus.ns)
    hmm.train(trainset=trainset, alpha=config.alpha, file=args.file)

    print("Use Viterbi algorithm to tag the dataset")
    tp, total, accuracy = hmm.evaluate(devset)
    print("Accuracy of dev: %d / %d = %4f\n" % (tp, total, accuracy))

    if args.bigdata:
        hmm = HMM.load(args.file)
        tp, total, accuracy = hmm.evaluate(testset)
        print("Accuracy of test: %d / %d = %4f" % (tp, total, accuracy))

    print("%ss elapsed" % (datetime.now() - start))
Example #4
model = HMM()
model.init_random(3, 4)     # presumably 3 hidden states and 4 emission symbols

# For a valid HMM, each row of the transition and emission matrices
# should sum to 1.
print(model.trans)
for line in model.trans:
    print(sum(line))

print(model.emit)
for line in model.emit:
    print(sum(line))

def dataFormatter(filename):
    # Read whitespace-separated observation strings and map each
    # character to an integer symbol (A -> 0, B -> 1, ...).
    with open(filename, 'r') as f:
        tmp = [list(s) for s in f.read().split()]
        sample = []
        for seq in tmp:
            seq = [ord(x) - ord('A') for x in seq]
            sample.append(seq)
        return sample

# sample = sample[0]
sample = dataFormatter('hmm_test1.in')
print(sample)

model.train(sample)

forEvaluation = dataFormatter('hmm_evaluation.in')

for seq in forEvaluation:
    print(model.evaluate(seq))
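
Assuming the A -> 0, B -> 1, ... mapping used in dataFormatter, the input
files are expected to hold whitespace-separated observation strings over
A-D, since init_random(3, 4) suggests 4 emission symbols. For example:

# A file containing the line
#     ABBA CADB
# is converted by dataFormatter into [[0, 1, 1, 0], [2, 0, 3, 1]].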