# Assumed module-level imports for this class (the project-local Alphabet and
# Chunk classes are not shown here and must be importable from the codebase).
import re
from collections import defaultdict

import nltk
import numpy


class MutualBootStrapper:

    def __init__(self, data, seeds, patterns=None, processing=1):
        if processing == 0:
            tokenized = self.tokenize(data)
            self.pos_tagged_data = self.pos_tag(tokenized)
            self.find_patterns = self.find_patterns_tagged
            self.find_seeds = self.find_seeds_tagged
        elif processing == 1:
            self.chunked_data = data
            self.find_patterns = self.find_patterns_chunked
            self.find_seeds = self.find_seeds_chunked

        self.permanent_lexicon = set(seeds)
        self.temporary_lexicon = defaultdict(set)
        for s in seeds:
            self.temporary_lexicon[s] = set()

        self.best_extraction_patterns = set()
        self.pattern_alphabet = Alphabet()
        if patterns is not None:
            for p in patterns:
                self.pattern_alphabet.add(p)

        self.n_counter_sets = None  # important for getting candidate seeds
        self.f_counter_sets = None
        self.n_pattern_array = None
        self.f_pattern_array = None
        self.first_pattern_words = set()

    def tokenize(self, text):
        print "tokenizing...",
        all_entries = []
        for entry in text:
            tokenized_entry = self._nested_tokenize(entry)
            all_entries.append(tokenized_entry)
        print "[DONE]"
        return all_entries

    def _nested_tokenize(self, untokenized_sentences):
        tokenized_sents = nltk.sent_tokenize(untokenized_sentences)
        tokenized_words = [nltk.word_tokenize(sent) for sent in tokenized_sents]
        self._postprocess_tokenized_text(tokenized_words)
        return tokenized_words

    def _postprocess_tokenized_text(self, tokenized):
        for i, sent in enumerate(tokenized):
            for j, word in enumerate(sent):
                tokenized[i][j] = word.lower()
                if "/" in word:
                    tokenized[i][j] = re.sub(r"/", r" / ", word)  # mutating the list

    def pos_tag(self, tokenized_data):
        print "POS tagging... ",
        pos_tagged_data = []
        for entry in tokenized_data:
            new_entry = []
            for sentence in entry:
                tagged = [("<START>", "<START>")]
                tagged.extend(nltk.pos_tag(sentence))
                new_entry.append(tagged)
            pos_tagged_data.append(new_entry)
        print "[DONE]"
        return pos_tagged_data

    def build_patterns_tagged(self, sentence, index, size):
        window_start = index - size
        window_end = index + 1
        sentence_copy = list(sentence)
        # the trailing comma makes this a 1-tuple, so zip() below still lines
        # up with the (word, tag) pairs in the rest of the sentence
        sentence_copy[index] = "<x>",
        while window_start <= index:  # this isn't quite right
            try:
                candidate = zip(*sentence_copy[window_start:window_end])[0]
            except IndexError:
                candidate = []
            if len(candidate) > 1:
                self.pattern_alphabet.add(tuple(candidate))
                if candidate[0] != "<x>":
                    self.first_pattern_words.add(candidate[0])
                else:
                    self.first_pattern_words.add(candidate[1])
            window_start += 1
            window_end += 1

    def find_patterns_tagged(self):
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i, (word, tag) in enumerate(sentence):
                    if word in self.temporary_lexicon:
                        self.build_patterns_tagged(sentence, i, 2)
                        self.build_patterns_tagged(sentence, i, 1)

    def find_patterns_chunked(self):
        for entry in self.chunked_data:
            for sentence in entry:
                for i, word in enumerate(sentence):
                    if isinstance(word, Chunk) and word.head in self.temporary_lexicon:
                        self.build_patterns_chunked(sentence, i, 2)
                        self.build_patterns_chunked(sentence, i, 1)

    def build_patterns_chunked(self, sentence, index, size):
        sentence_copy = list(sentence)
        sentence_copy[index] = "<x>",
        sentence_copy = self._flatten_chunks(sentence_copy)
        index = sentence_copy.index("<x>")
        window_start = index - size
        window_end = index + 1
        while window_start <= index:
            candidate = sentence_copy[window_start:window_end]
            if len(candidate) > 1:
                self.pattern_alphabet.add(tuple(candidate))
            window_start += 1
            window_end += 1

    def _flatten_chunks(self, sentence):
        flattened_sentence = []
        for constituent in sentence:
            if isinstance(constituent, Chunk):
                flattened_sentence.extend(constituent.tokens)
            else:
                flattened_sentence.append(constituent[0])
        return flattened_sentence

    def set_counter_arrays(self):
        tmp_lst = [[]] * self.pattern_alphabet.size()  # must be careful about pointers here
        self.n_counter_sets = map(set, tmp_lst)
        self.f_counter_sets = map(set, tmp_lst)

    def find_seeds_chunked(self):
        for entry in self.chunked_data:
            for sentence in entry:
                for i in range(len(sentence)):
                    if isinstance(sentence[i], Chunk):
                        self.match_pattern_chunked(sentence, i, 2)
                        self.match_pattern_chunked(sentence, i, 1)

    def match_pattern_chunked(self, sentence, index, size):
        candidate_seed = sentence[index].head
        sentence_copy = list(sentence)
        sentence_copy[index] = "<x>",
        sentence_copy = self._flatten_chunks(sentence_copy)
        index = sentence_copy.index("<x>")
        window_start = index - size
        window_end = index + 1
        while window_start <= index:
            window = sentence_copy[window_start:window_end]
            pattern = tuple(window)
            if len(pattern) > 1 and \
               self.pattern_alphabet.has_label(pattern) and \
               len(candidate_seed) > 2:
                pattern_index = self.pattern_alphabet.get_index(pattern)
                # increment our counters
                self.n_counter_sets[pattern_index].add(candidate_seed)
                if candidate_seed not in self.temporary_lexicon:
                    self.f_counter_sets[pattern_index].add(candidate_seed)
            window_start += 1
            window_end += 1

    def find_seeds_tagged(self):
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i in range(len(sentence)):
                    if sentence[i][0] in self.first_pattern_words:
                        self.match_pattern_tagged(sentence, i, 3)
                        self.match_pattern_tagged(sentence, i, 2)

    def match_pattern_tagged(self, sentence, index, size):
        window_start = index - 1
        window_end = index + size - 1
        window = sentence[window_start:window_end]
        for seed_candidate_index in range(len(window)):
            window_copy = list(window)
            _, pos = window_copy[seed_candidate_index]
            window_copy[seed_candidate_index] = ("<x>", pos)
            pattern = tuple(zip(*window_copy)[0])
            if len(pattern) > 1 and \
               self.pattern_alphabet.has_label(pattern) and \
               window[seed_candidate_index][1].startswith("NN") and \
               len(window[seed_candidate_index][0]) > 2:
                candidate_seed = window[seed_candidate_index][0]
                pattern_index = self.pattern_alphabet.get_index(pattern)
                # increment our counters
                self.n_counter_sets[pattern_index].add(candidate_seed)
                if candidate_seed not in self.temporary_lexicon:
                    self.f_counter_sets[pattern_index].add(candidate_seed)

    def calculate_pattern_scores(self):
        self.n_pattern_array = numpy.array(map(len, self.n_counter_sets), dtype=float) + 1.
        self.f_pattern_array = numpy.array(map(len, self.f_counter_sets), dtype=float) + 1.
        self.pattern_scores = numpy.nan_to_num(
            (self.f_pattern_array / self.n_pattern_array) * numpy.log2(self.f_pattern_array))

    def calculate_seed_scores(self):
        self.candidate_seed_scores = {}
        for candidate_seed, matched_patterns_set in self.temporary_lexicon.iteritems():
            matched_patterns = list(matched_patterns_set)
            score = numpy.sum((self.pattern_scores[matched_patterns] * 0.01) + 1)
            #print score
            self.candidate_seed_scores[candidate_seed] = score

    def cull_candidates(self):
        self.calculate_pattern_scores()
        self.calculate_seed_scores()
        sorted_candidates = sorted([(v, k) for k, v in self.candidate_seed_scores.iteritems()],
                                   reverse=True)
        #print sorted_candidates
        try:
            return zip(*sorted_candidates)[1][:5]
        except IndexError:
            return []

    def run_mutual_bootstrapping(self):
        added_patterns = 0
        best_score = 5
        while added_patterns < 10 or best_score > 1.8:
            self.find_patterns()
            self.set_counter_arrays()
            self.find_seeds()
            self.calculate_pattern_scores()

            best_pattern_index = numpy.nanargmax(self.pattern_scores)
            while best_pattern_index in self.best_extraction_patterns:
                self.pattern_scores[best_pattern_index] = -10000000.
                best_pattern_index = numpy.nanargmax(self.pattern_scores)

            if self.pattern_scores[best_pattern_index] < 0.7:
                return

            best_score = self.pattern_scores[best_pattern_index]
            #print best_score, self.pattern_alphabet.get_label(best_pattern_index)
            self.best_extraction_patterns.add(best_pattern_index)
            for seed in self.n_counter_sets[best_pattern_index]:
                self.temporary_lexicon[seed].add(best_pattern_index)
            added_patterns += 1

    def run_meta_bootstrapping(self):
        best_five = self.cull_candidates()
        self.permanent_lexicon.update(best_five)
        self.temporary_lexicon = defaultdict(set)
        for s in self.permanent_lexicon:
            self.temporary_lexicon[s] = set()

    def run(self, num_iterations=50):
        for i in range(num_iterations):
            print "Iteration: {:d}".format(i + 1)
            print "running mutual bootstrapping..."
            self.run_mutual_bootstrapping()
            print "[DONE]"
            print "running meta bootstrapping...",
            self.run_meta_bootstrapping()
            print "[DONE]"
            print "number of seed terms: {:d}".format(len(self.permanent_lexicon))
            print "number of total patterns: {:d}".format(self.pattern_alphabet.size())
            print "\n"

    def save_seeds(self, outfile):
        with open(outfile, "w") as f_out:
            f_out.write("\n".join(s.encode("utf-8") for s in self.permanent_lexicon))

    def save_patterns(self, outfile):
        with open(outfile, "w") as f_out:
            patterns = []
            for pattern_index in self.best_extraction_patterns:
                patterns.append(" ".join(self.pattern_alphabet.get_label(pattern_index)))
            f_out.write("\n".join(s.encode("utf-8") for s in patterns))
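

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal driver showing
# how MutualBootStrapper is meant to be called. It assumes the module-level
# imports above plus the project-local Alphabet and Chunk classes are
# available, and that the NLTK tokenizer/tagger models are installed. The
# seed terms, text, and output path below are hypothetical; in practice the
# bootstrapping loop expects a much larger corpus than this toy example.
def _demo_mutual_bootstrapper():
    example_entries = [
        "The delegates met in Paris. Paris hosted the summit.",
        "The next meeting moved to Geneva and then to Rome.",
    ]
    example_seeds = ["paris", "rome"]
    # processing=0 tokenizes and POS-tags raw strings with NLTK;
    # processing=1 expects entries that are already chunked into Chunk objects.
    bootstrapper = MutualBootStrapper(example_entries, example_seeds, processing=0)
    bootstrapper.run(num_iterations=1)
    bootstrapper.save_seeds("seeds.txt")  # hypothetical output path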
# Assumed module-level imports for this class (the project-local Alphabet
# class and the mst() maximum-spanning-tree function are not shown here and
# must be importable from the codebase).
import cPickle
import random

import numpy


class Parser:

    def __init__(self, feature_generator_list, decay=False):
        self.feature_generator_list = feature_generator_list
        self.feature_alphabet = Alphabet()
        self.label_alphabet = Alphabet()  # you will need this if you use labeled arcs
        self.weights = None
        self.learning_rate = 0.0001
        self.num_iterations = 10
        self.caches = {}
        self.decay = decay

    def featurize(self, src, dst, sentence, grow_alphabets=True):
        """Generate feature indices for an arc from src -> dst.

        Args:
            src, dst: token indices for the arc src -> dst
            sentence: a dictionary in which you can put whatever you want
        """
        feature_list = []
        for feature_generator in self.feature_generator_list:
            feature_list.extend(feature_generator(src, dst, sentence))
        if grow_alphabets:  # set to False when running this function on the dev/test set
            for feature, bias in feature_list:
                self.feature_alphabet.add(feature)
            for src, dst, label in sentence['arcs']:
                self.label_alphabet.add(label)
        feature_vector = [(self.feature_alphabet.get_index(x), feature_value)
                          for x, feature_value in feature_list
                          if self.feature_alphabet.has_label(x)]
        return ([x for x, y in feature_vector],
                numpy.array([y for x, y in feature_vector]))

    def make_fully_connected_graph(self, sentence):
        """Make a graph to build an MST from.

        If G is such a graph, then the weight for an arc from token i to
        token j is G[i][j], i.e. G is a dictionary and G[i] is also a
        dictionary. If arc i -> j does not exist, then j not in G[i].
        Uses self.featurize for all possible edges.

        Args:
            sentence: a dictionary in which you can put whatever you want
        """
        G = {}
        # get a list of indices
        indices = range(len(sentence['tokens']))
        # make an arc for each pair
        for i in indices:
            G[i] = {}
            for j in indices:
                if i != j:
                    G[i][j] = self.featurize(int(i), int(j), sentence, False)
        return G

    ###########################
    #Actual training function!#
    ###########################

    def train(self, training_sentences, dev_sentences=None, prealloc=False):
        """Perceptron algorithm for learning edge weights.

        If a dev set is provided, then we can evaluate the parser every few
        iterations just so we know the progress of the training process and
        can see whether we need more iterations.

        Args:
            training_sentences: a list of dictionaries in which you can put
                whatever you want
        """
        # this is where the feature alphabet and the weight vector get populated
        # cache training sentences; populate alphabets
        print "Populating features and caching training sentences ..."
        self._add_to_caches(training_sentences, 'training', not prealloc)
        print "Done!"

        # cache dev sentences
        if dev_sentences:
            print "Caching dev sentences ... "
            self._add_to_caches(dev_sentences, 'dev', False)
            print "Done!"

        # initialize weight vector
        if not prealloc:  # don't touch this business if it's preallocated
            print "Initializing weight vector ..."
            self.weights = numpy.zeros(len(self.feature_alphabet))
            #self.weights = numpy.zeros((len(self.feature_alphabet) + 1) * len(self.label_alphabet))
            random.seed()
            for i, weight in enumerate(self.weights):
                self.weights[i] += .00001
            print "Done!"

        # okay, start training, bro
        for i in xrange(self.num_iterations):
            print "Pass %d:\n" % (i + 1)
            if dev_sentences is not None and i % 2 == 0:  # tracking progress
                print "Current UAS: %f" % self.evaluate(dev_sentences, 'dev')
            for j, sentence in enumerate(training_sentences):
                if not j % 1000:
                    print "Training on sentences %d to %d of %d ..." % \
                        (j, min(j + 999, len(training_sentences)), len(training_sentences))
                #graph = self.make_fully_connected_graph(sentence)
                fcg = self.caches['training']['fcgs'][j]
                graph = self._featurized_to_weighted(fcg)
                max_spanning_tree = mst(0, graph)
                # perceptron update: gold counts vs. counts from the predicted tree
                gold = self.caches['training']['counts'][j]
                hypo = self._get_counts(self._fcg_to_featurized(fcg, max_spanning_tree))
                self._mutate_weights(gold, hypo)
            if self.decay:
                self.learning_rate *= 0.9

    def evaluate(self, sentences, key):
        """Compute evaluation metrics.

        Computes the Unlabeled Attachment Score (UAS): the fraction of gold
        arcs that also appear in the predicted spanning tree.
        """
        good = 0
        total = 0
        for j, sent in enumerate(sentences):
            fcg = self.caches[key]['fcgs'][j]
            graph = self._featurized_to_weighted(fcg)
            try:
                hypo = self._arcset(mst(0, graph))
            except:  # debug
                print '.',
                continue
            gold = set([(int(head), int(dep)) for head, dep, lab in sent['arcs']])
            good += len(hypo.intersection(gold))
            total += len(gold)
        return float(good) / total

    def serialize(self, fname):
        """Convert to dictionary representation and serialize."""
        d = {}
        d['weights'] = self.weights
        d['feat_alph'] = self.feature_alphabet.to_dict()
        d['label_alph'] = self.label_alphabet.to_dict()
        d['features'] = self.feature_generator_list
        d['decay'] = self.decay
        with open(fname, 'wb') as outf:
            cPickle.dump(d, outf)

    def deserialize(self, fname):
        """Retrieve from serialization; keep defaults where possible."""
        with open(fname, 'rb') as inf:
            d = cPickle.load(inf)
        self.weights = d['weights']
        self.feature_alphabet = Alphabet.from_dict(d['feat_alph'])
        self.label_alphabet = Alphabet.from_dict(d['label_alph'])
        self.feature_generator_list = d['features']
        self.decay = d['decay']

    def try_parse(self, inp):
        """Determine whether the provided input is a file or a string."""
        import nltk
        # was it a text file?
        try:
            inp = open(inp, 'rb').read()
        # nope!
        except IOError:
            pass
        paragraph = nltk.sent_tokenize(inp)
        for sentence in paragraph:
            self.parse(sentence)

    def parse(self, sentence_string):
        """Extra credit: parse an arbitrary string.

        This is actually what we want at the end. Given an arbitrary string:
        0) split it into sentences (if you want to accept multiple sentences)
        1) tokenize
        2) POS-tag and apply other pre-processing
        3) parse it!
        4) draw it using nltk's draw_trees like in the example
           (it does not support labeled arcs, though :( )
        """
        # draw a tree
        from nltk.draw.tree import draw_trees
        from nltk.tree import Tree
        import nltk

        words = nltk.pos_tag(nltk.word_tokenize(sentence_string))
        sentence = {'tokens': ['ROOT'], 'arcs': [], 'pos': ['ROOT']}
        for word, pos in words:
            sentence['tokens'].append(word)
            sentence['pos'].append(pos)
        indices = range(len(sentence['tokens']))
        fcg = self.make_fully_connected_graph(sentence)
        weighted = self._featurized_to_weighted(fcg)
        max_spanning_tree = mst(0, weighted)
        wlist = sentence['tokens']
        # print the dependencies
        for i in max_spanning_tree.keys():
            for j in max_spanning_tree[i].keys():
                print "%s->%s" % (i, j)
        t = self._build_tree(max_spanning_tree, 0, wlist)
        draw_trees(Tree(t))

    ###################################
    #A whole bunch of helper functions#
    ###################################

    def _build_tree(self, G, root, wlist):
        if root in G.keys():
            return '(' + str(wlist[root]) + ' '.join(
                [self._build_tree(G, ind, wlist) for ind in G[root]]) + ')'
        else:
            return '(%s)' % str(wlist[root])

    def _featurized_to_weighted(self, graph):
        """Convert a fully-connected graph to one with arc weights."""
        wG = {}
        for i in graph.keys():
            #for j in graph[i].keys():
            for j in graph.keys():
                if i != j:
                    arclength = -(numpy.sum(self.weights[graph[i][j][0]]
                                            * graph[i][j][1]))
                    if not arclength:
                        arclength = 1
                    if i in wG.keys():
                        wG[i][j] = arclength
                    else:
                        wG[i] = {j: arclength}
        return wG

    def _add_to_caches(self, sentence_set, key, grow_alph):
        """Add to the stored caches under a given key."""
        self.caches[key] = {'fcgs': [], 'counts': []}
        for sentence in sentence_set:
            self.caches[key]['counts'].append(
                self._get_counts(self._sentence_to_featurized(sentence, grow_alph)))
        for sentence in sentence_set:
            self.caches[key]['fcgs'].append(
                self.make_fully_connected_graph(sentence))

    def _get_counts(self, graph):
        """Convert a graph into a dictionary of arc counts."""
        counts = {}
        for i in graph.keys():
            for j in graph[i].keys():
                for feat, weight in zip(*graph[i][j]):
                    if feat in counts.keys():
                        counts[feat] += weight
                    else:
                        counts[feat] = weight
        return counts

    def _fcg_to_featurized(self, fully_connected, spanning_tree):
        """Given a maximum spanning tree, retrieve the appropriate features
        from the fully-connected graph."""
        feat_tree = {}
        for head in spanning_tree.keys():
            for dep, weight in spanning_tree[head].iteritems():
                if head in feat_tree.keys():
                    feat_tree[head][dep] = fully_connected[head][dep]
                else:
                    feat_tree[head] = {dep: fully_connected[head][dep]}
        return feat_tree

    def _sentence_to_featurized(self, sentence, grow=True):
        """Create a graph dictionary with feature vectors."""
        # declare a graph (it's an empty dictionary)
        G = {}
        # featurize all arcs
        for src, dst, label in sentence['arcs']:
            features = self.featurize(int(src), int(dst), sentence, grow)
            try:
                G[int(src)][int(dst)] = features
            except:
                G[int(src)] = {int(dst): features}
        return G

    def _mutate_weights(self, gold, hypo):
        """Change the weights by comparing a hypothesis with the gold standard."""
        counts = {}
        # get set of all features involved; aggregate counts
        for elem in set(gold.keys()).union(set(hypo.keys())):
            counts[elem] = gold.get(elem, 0) - hypo.get(elem, 0)
        # adjust weights
        self.weights[counts.keys()] += \
            numpy.array(counts.values()) * self.learning_rate

    def _arcset(self, G):
        return set([(i, j) for i in G.keys() for j in G[i].keys()])
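

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): training and evaluating the
# Parser with a single toy feature generator. It assumes the module-level
# imports above plus the project-local Alphabet class and mst() function are
# importable. The feature generator and the sentence dictionary below are
# hypothetical, but follow the format the class expects: 'tokens', 'pos', and
# 'arcs' as (head, dependent, label) triples, with index 0 reserved for ROOT.
def _demo_parser():
    def head_dep_pos(src, dst, sentence):
        # one (feature_name, value) pair per arc
        return [("pos_pair=%s_%s" % (sentence['pos'][src], sentence['pos'][dst]), 1.0)]

    toy_sentence = {
        'tokens': ['ROOT', 'dogs', 'bark'],
        'pos': ['ROOT', 'NNS', 'VBP'],
        'arcs': [(0, 2, 'root'), (2, 1, 'nsubj')],
    }
    parser = Parser([head_dep_pos])
    parser.train([toy_sentence])
    # evaluate against the cached training sentences (key 'training')
    print parser.evaluate([toy_sentence], 'training')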
# Assumed module-level imports for this class (the project-local Alphabet
# class is not shown here and must be importable from the codebase).
import math

import nltk
import numpy as np


class Naive_Bayes(object):
    """Bernoulli naive Bayes classifier over binary word features."""

    def __init__(self, data, feature_function):
        """
        Takes a dictionary mapping labels to lists of strings with that label,
        and a function which produces a list of feature values from a string.
        """
        self.data = data
        self.feature_codebook = Alphabet()
        # self.word_dict = Alphabet()
        self.label_codebook = Alphabet()
        self.feature_function = feature_function

    # def _build_instance_list(self):
    #     instance_list = {}
    #     for label, documents in self.data.items():
    #         instance_list[label] = []
    #         for doc in documents:
    #             vector = self.extract_feature(self.data, doc, s)
    #             instance_list[label].append(vector)
    #     self.instance_list = instance_list
    #
    # def _populate_codebook(self):
    #     for label in self.instance_list:
    #         self.label_codebook.add(label)
    #     # here we use the whole word set as features
    #     self.feature_codebook = copy.deepcopy(self.word_dict)

    def extract_feature(self, string):
        """Map a string to a binary feature vector over the feature codebook."""
        vector = np.zeros(self.feature_codebook.size())
        tokens = set(nltk.regexp_tokenize(string, pattern=r"\w+"))
        indice = 0
        for word in tokens:
            if self.feature_codebook.has_label(word):
                indice = self.feature_codebook.get_index(word)
                vector[indice] = 1.0
        return vector

    def _collect_counts(self):
        """Accumulate per-label document counts and per-label feature counts."""
        self.count_table = np.zeros((self.feature_codebook.size(), self.label_codebook.size()))
        self.count_y_table = np.zeros(self.label_codebook.size())
        for label, docs in self.instance_list.items():
            Y_index = self.label_codebook.get_index(label)
            for vector in docs:
                self.count_y_table[Y_index] += 1.0
                self.count_table[:, Y_index] += vector
                # for sparse vectors we would use a different counting method:
                # for x in vector:
                #     self.count_table[x, Y_index] += 1.0

    def train(self, theta):
        """Build instances with the feature function, then estimate smoothed parameters."""
        self.instance_list = self.feature_function(self.data, self.label_codebook,
                                                   self.feature_codebook, theta)
        # self._populate_codebook_withSelectFeature()
        # self.instance_list = self.feature_function(self.data, self.label_codebook,
        #                                            self.feature_codebook, select_feature)
        self._collect_counts()
        self.p_x_given_y_table = np.zeros((self.feature_codebook.size(), self.label_codebook.size()))
        self.p_y_table = np.zeros(self.label_codebook.size())
        self.p_x_given_y_table = (self.count_table + 0.2) / \
            (self.count_y_table + self.feature_codebook.size() * 0.2)
        self.p_y_table = self.count_y_table / self.count_y_table.sum()

    def compute_log_unnormalized_score(self, feature_vector):
        """Compute log P(X|Y) + log P(Y) for all values of Y

        Returns a vector of loglikelihoods, e.g.
        loglikelihood_vector[0] = log P(X|Y=0) + log P(Y=0)
        """
        loglikelihood_vector = np.zeros(self.label_codebook.size())
        for label in range(0, self.label_codebook.size()):
            logpro = math.log(self.p_y_table[label])
            for feature_index in range(0, self.feature_codebook.size()):
                logpro += feature_vector[feature_index] * \
                    math.log(self.p_x_given_y_table[feature_index, label]) + \
                    (1 - feature_vector[feature_index]) * \
                    math.log(1 - self.p_x_given_y_table[feature_index, label])
            loglikelihood_vector[label] = logpro
        return loglikelihood_vector

    def classify(self, string):
        """Classify a string according to the feature function and training
        data provided at initialization, and return the predicted label.
        """
        feature_vector = self.extract_feature(string)
        logvector = self.compute_log_unnormalized_score(feature_vector)
        # print vector
        pre_label_index = np.argmax(logvector)
        return self.label_codebook.get_label(pre_label_index)
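

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal driver for
# Naive_Bayes. It assumes the module-level imports above plus the
# project-local Alphabet class. The bag_of_words feature function below is
# hypothetical; it shows the contract train() expects: populate the two
# codebooks and return a dict mapping each label to a list of dense 0/1
# feature vectors. The theta argument is ignored here (in the real feature
# function it presumably controls feature selection).
def _demo_naive_bayes():
    training_data = {
        'pos': ["great fun film", "a great movie"],
        'neg': ["dull boring film", "a boring mess"],
    }

    def bag_of_words(data, label_codebook, feature_codebook, theta):
        # populate the codebooks and return {label: [dense 0/1 vectors]}
        for label, docs in data.items():
            label_codebook.add(label)
            for doc in docs:
                for word in nltk.regexp_tokenize(doc, pattern=r"\w+"):
                    if not feature_codebook.has_label(word):
                        feature_codebook.add(word)
        instance_list = {}
        for label, docs in data.items():
            instance_list[label] = []
            for doc in docs:
                vector = np.zeros(feature_codebook.size())
                for word in nltk.regexp_tokenize(doc, pattern=r"\w+"):
                    vector[feature_codebook.get_index(word)] = 1.0
                instance_list[label].append(vector)
        return instance_list

    classifier = Naive_Bayes(training_data, bag_of_words)
    classifier.train(theta=0.0)
    print classifier.classify("a great film")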
# Assumed module-level imports for this class (the project-local Alphabet and
# BaseClassifier classes are not shown here and must be importable from the
# codebase; logsumexp may instead live in scipy.special, depending on the
# SciPy version).
import numpy
from scipy.misc import logsumexp
from scipy.optimize import fmin_l_bfgs_b


class MaxEnt(BaseClassifier):

    def __init__(self):
        """Initialize the model

        label_codebook, feature_codebook, and parameters must be assigned
        properly in order for the model to work; the parameters and codebooks
        are handled in the train function.
        """
        super(MaxEnt, self).__init__()
        self.label_codebook = Alphabet()
        self.feature_codebook = Alphabet()
        #self.gaussian_prior_variance = 1
        self.parameters = []
        self.gaussian_prior_variance = 1.0

    def compute_observed_counts(self, instance_list):
        """Compute observed feature counts

        This only needs to be done once because it is parameter-independent;
        the observed feature counts are then stored internally. Note that we
        are fitting the model with intercept terms, so the count of the
        intercept term is the count of that class.
        """
        # the data and label in each instance both use a sparse vector
        self.feature_counts = numpy.zeros((self.feature_codebook.size() + 1) * self.label_codebook.size())
        for instance in instance_list:
            Y_index = (self.feature_codebook.size() + 1) * instance.label
            self.feature_counts[Y_index] += 1
            # instance.data is a numpy array of feature indices
            indices = Y_index + instance.data + 1
            self.feature_counts[indices] += 1
        #print self.feature_counts[:self.feature_codebook.size()+1]
        #print self.feature_counts[self.feature_codebook.size()+1:]

    def compute_expected_feature_counts(self, instance_list):
        """Compute expected feature counts

        E(feature|X) = sum over i,y E(feature(Xi,yi)|Xi)
                     = sum over i,y feature(Xi,yi) P(Y=yi|Xi)

        We take advantage of the inference function in this class to compute
        expected feature counts, which are only needed for training: the
        expected counts are accumulated over all instances and all labels.

        Returns: expected feature counts table
        """
        expected_feature_counts = numpy.zeros(len(self.parameters))
        for instance in instance_list:
            posterior = self.compute_label_unnormalized_loglikelihood_vector(instance.data)
            posterior = numpy.exp(posterior - logsumexp(posterior))
            for label in range(0, self.label_codebook.size()):
                Y_index = label * (self.feature_codebook.size() + 1)
                expected_feature_counts[Y_index] += posterior[label]
                indices = Y_index + instance.data + 1
                expected_feature_counts[indices] += posterior[label]
        return expected_feature_counts

    def classify_instance(self, instance):
        """Apply the model to a new instance

        Returns: the label index with the maximum probability
        """
        vector = self.compute_posterior_distribution(instance)
        #print vector
        pre_label_index = numpy.argmax(vector)
        return pre_label_index

    def compute_posterior_distribution(self, instance):
        """Compute P(Y|X)

        Returns a vector of the same size as the label_codebook. The vector
        contains unnormalized likelihoods; since we only use them to find the
        most probable label, we don't have to normalize them.
        """
        sparse_vector = numpy.array([self.feature_codebook.get_index(i)
                                     for i in instance.data
                                     if self.feature_codebook.has_label(i)])
        posterior_distribution = numpy.zeros(self.label_codebook.size())
        posterior_distribution = numpy.exp(
            self.compute_label_unnormalized_loglikelihood_vector(sparse_vector))
        return posterior_distribution

    def compute_label_unnormalized_loglikelihood_vector(self, sparse_feature_vector):
        """Compute unnormalized log scores from the log-linear model

        log P(Y|X) is proportional to feature vector * parameter vector, but
        we use a sparse vector representation, so we need the index tricks
        that numpy allows us to do.

        For each label, compute the unnormalized loglikelihood (sum of
        lambdas) given the sparse_feature_vector.

        Returns: a vector of scores, one per label y
        """
        loglikelihood_score_vector = numpy.zeros(self.label_codebook.size())
        for label in range(0, self.label_codebook.size()):
            Y_index = label * (self.feature_codebook.size() + 1)
            indices = Y_index + sparse_feature_vector + 1
            if len(indices) != 0:
                loglikelihood_score_vector[label] = self.parameters[Y_index] + sum(self.parameters[indices])
            else:
                loglikelihood_score_vector[label] = self.parameters[Y_index]
        return loglikelihood_score_vector

    def objective_function(self, parameters):
        """Compute negative (log P(Y|X,lambdas) + log P(lambdas))

        The function that we want to optimize over. Here a Gaussian
        distribution (mean=0.0, sigma=1.0) is used as the prior on P(lambda).

        Args:
            parameters: the parameter vector updated by the training procedure

        Returns: negative total log-likelihood
        """
        total_loglikelihood = 0.0
        numerator = 0.0
        denominator = 0.0
        #prior = 0.0
        #self.gaussian_prior_variance = 1.0
        prior = sum([i ** 2 / (2 * self.gaussian_prior_variance ** 2) for i in parameters])
        self.parameters = numpy.array(parameters)

        # Compute the loglikelihood here
        loglikelihood_score_vector = numpy.zeros(self.label_codebook.size())
        for instance in self.training_data:
            Y_index = instance.label * (self.feature_codebook.size() + 1)
            indices = Y_index + instance.data + 1
            numerator += (parameters[Y_index] + sum(parameters[indices]))
            score_vector = self.compute_label_unnormalized_loglikelihood_vector(instance.data)
            #print score_vector
            denominator += logsumexp(score_vector)
        #print numerator
        #print denominator
        total_loglikelihood = numerator - denominator - prior
        print - total_loglikelihood
        return - total_loglikelihood

    def gradient_function(self, parameters):
        """Compute the gradient of negative (log P(Y|X,lambdas) + log P(lambdas)) wrt lambdas

        With some algebra, we have that

            gradient wrt lambda_i = observed_count of feature i
                                    - expected_count of feature i
                                    - lambda_i / gaussian_prior_variance^2

        The first term is computed before running the optimization function
        and is a constant. The second term needs inference to get
        P(Y|X, lambdas) and is a bit expensive. The third term comes from
        taking the derivative of the log Gaussian prior.

        Returns: a gradient vector
        """
        self.parameters = numpy.array(parameters)
        #print self.parameters
        #print parameters
        gradient_vector = numpy.zeros(len(parameters))
        observed_count_vector = self.feature_counts
        expected_count_vector = self.compute_expected_feature_counts(self.training_data)
        dprior = numpy.array([i / self.gaussian_prior_variance ** 2 for i in parameters])
        # compute gradient here
        gradient_vector = observed_count_vector - expected_count_vector - dprior
        return - gradient_vector

    def train(self, instance_list):
        """Find the optimal parameters for the maximum entropy classifier

        We set up an instance of MaxEnt to use as the inference engine needed
        for parameter fitting. The MaxEnt instance and the training set are
        stored internally in the trainer just so we can avoid passing extra
        arguments into the optimization function. We leave the actual number
        crunching and search to the fmin_l_bfgs_b function. There are a few
        tunable parameters for the optimization function, but the defaults
        are usually well-tuned and sufficient for most purposes.

        Args:
            instance_list: each instance.data should be a list of string
                features. This function will create a sparse feature vector
                representation based on the alphabet.

        Returns: a maximum entropy classifier with the parameters
        (MAP estimate from the data and the Gaussian prior)
        """
        assert(len(instance_list) > 0)
        ######################################
        # Do any further processing right here, e.g. populate the codebooks,
        # make sparse vectors, etc.
        self.label_codebook.add('neg')
        self.label_codebook.add('pos')
        for index, instance in enumerate(instance_list):
            sparse_vector = numpy.zeros(0, dtype=numpy.int)
            for feature in instance.data:
                if not self.feature_codebook.has_label(feature):
                    self.feature_codebook.add(feature)
                sparse_vector = numpy.append(sparse_vector, self.feature_codebook.get_index(feature))
            instance_list[index].data = sparse_vector
        ##################
        self.parameters = numpy.zeros((self.feature_codebook.size() + 1) * self.label_codebook.size())
        self.training_data = instance_list
        self.compute_observed_counts(instance_list)

        num_labels = self.label_codebook.size()
        num_features = self.feature_codebook.size()
        init_point = numpy.zeros(num_labels * (num_features + 1))
        optimal_parameters, _, _ = fmin_l_bfgs_b(self.objective_function, init_point,
                                                 fprime=self.gradient_function)
        print optimal_parameters
        self.parameters = optimal_parameters

    def to_dict(self):
        model_dict = {
            'label_alphabet': self.label_codebook.to_dict(),
            'feature_alphabet': self.feature_codebook.to_dict(),
            'parameters': self.parameters.tolist(),
        }
        return model_dict

    @classmethod
    def from_dict(cls, model_dictionary):
        model_instance = MaxEnt()
        model_instance.label_codebook = Alphabet.from_dict(model_dictionary['label_alphabet'])
        model_instance.feature_codebook = Alphabet.from_dict(model_dictionary['feature_alphabet'])
        model_instance.parameters = numpy.array(model_dictionary['parameters'])
        return model_instance
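

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): training and applying the
# MaxEnt classifier. It assumes the module-level imports above plus the
# project-local Alphabet and BaseClassifier classes. The _Instance class and
# the feature strings below are hypothetical stand-ins for the project's real
# Instance objects; the label indices assume the Alphabet assigns indices in
# insertion order ('neg' -> 0, 'pos' -> 1), as train() hard-codes those labels.
def _demo_maxent():
    class _Instance(object):
        # hypothetical stand-in: .label is an integer class index,
        # .data starts out as a list of feature strings
        def __init__(self, label, data):
            self.label = label
            self.data = data

    train_set = [
        _Instance(1, ['word=great', 'word=fun']),     # assumed index 1 -> 'pos'
        _Instance(1, ['word=great', 'word=film']),
        _Instance(0, ['word=dull', 'word=boring']),   # assumed index 0 -> 'neg'
        _Instance(0, ['word=boring', 'word=mess']),
    ]
    model = MaxEnt()
    model.train(train_set)

    test = _Instance(None, ['word=fun', 'word=film'])
    predicted = model.classify_instance(test)
    print model.label_codebook.get_label(predicted)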