def _prob_classify(self, input):
    """
    Return a probability distribution over labels for a raw input.

    The input is tokenized into a featureset, feature names that were never
    seen with any label are discarded, and the log probabilities of the
    remaining feature values are summed per label. Every label starts from
    a log probability of 0 (no label prior is applied) to avoid skewing
    towards larger data sets.

    :param input: the raw input handed to ``self._tokenizeInputToFeatures``.
    :return: a ``DictionaryProbDist`` built from the per-label log
        probabilities with ``normalize=True, log=True``.
    """
    # Work on a copy so the tokenizer's result is never mutated.
    featureset = self._tokenizeInputToFeatures(input).copy()

    # Discard feature names never seen with any label. Iterate over a
    # snapshot of the keys: deleting from a dict while iterating its live
    # key view raises RuntimeError on Python 3.
    for feature_name in list(featureset.keys()):
        seen = any(
            (label, feature_name) in self._featureProbabilityDistribution
            for label in self._labels)
        if not seen:
            del featureset[feature_name]

    # Start with a log probability of 0 for every label.
    logprob = {}
    for label in self._labels:
        logprob[label] = 0

    # Add in the log probability of features given labels,
    # e.g. labels: location, time, noise; featureset: {turkey: True}.
    for label in self._labels:
        for (feature_name, feature_val) in featureset.items():
            if (label, feature_name) in self._featureProbabilityDistribution:
                feature_probs = self._featureProbabilityDistribution[
                    label, feature_name]
                logprob[label] += feature_probs.logprob(feature_val)
            else:
                # nb: This case will never come up if the classifier was
                # created by NaiveBayesClassifier.train().
                logprob[label] += sum_logs([])  # = -INF.

    return DictionaryProbDist(logprob, normalize=True, log=True)
def get_class_probs(self, token):
    """
    Return a probability distribution over classes for a token.

    The token's feature vector is looked up under the key returned by
    ``self.property('FEATURE_VECTOR')``. For each class, the weights of all
    assigned features are raised to the feature value and multiplied
    together; the products are then normalized.

    :param token: mapping that holds the feature vector.
    :return: a normalized ``DictionaryProbDist`` keyed by class.
    :raise ValueError: if the feature vector length times the number of
        classes does not equal the number of weights.
    """
    feature_vector = token[self.property('FEATURE_VECTOR')]

    if len(feature_vector) * len(self._classes) != len(self._weights):
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError('Bad feature vector length')

    prob_dict = {}
    for i, cls in enumerate(self._classes):
        # Find the offset into the weights vector.
        offset = i * len(feature_vector)

        # Multiply the weights of all active features for this class.
        prod = 1.0
        for (feature_id, val) in feature_vector.assignments():
            prod *= (self._weights[feature_id + offset] ** val)
        prob_dict[cls] = prod

    # Normalize the dictionary to give a probability distribution
    return DictionaryProbDist(prob_dict, normalize=True)
def prob_classify(self, featureset):
    """
    Return a probability distribution of classifications

    :param featureset: a dict of feature/value pairs in NLTK format,
        representing a single instance
    :return: a ``DictionaryProbDist`` over the two resolved class labels
    :raise Exception: if the classifier has not been trained
    """
    if self._model is None:
        raise Exception('This classifier is not yet trained')

    # do the classification
    prediction = self._get_svm_classification(featureset)
    if self._verbose:
        print('prediction', prediction)

    # sometimes the result is not within -1 ... +1; clip it so
    # that it is, and we get a sane-looking probability
    # distribution. this will upset some results with non-linear
    # partitioning where instance-hyperplane distance can be many
    # orders of magnitude larger; I don't have a fix for that
    if prediction < -1.0:
        prediction = -1.0
    if prediction > 1.0:
        prediction = 1.0

    # Strictly positive predictions belong to the +1 class, everything else
    # (including exactly 0) takes the other branch. This is the same split
    # that cmp(prediction, 0) == 1 produced; cmp() does not exist on
    # Python 3, and clipping never changes the sign.
    # NOTE(review): for 0 < prediction < 0.5 the +1 class still receives
    # less mass than the -1 class (and symmetrically for small negative
    # values); confirm whether this mapping is intended.
    if prediction > 0:
        distribution = {
            str(self.resolve_prediction(1)): prediction,
            str(self.resolve_prediction(-1)): 1 - prediction
        }
    else:
        distribution = {
            str(self.resolve_prediction(1)): prediction + 1,
            str(self.resolve_prediction(-1)): -prediction
        }

    return DictionaryProbDist(distribution)
def parse_sausage(fname):
    """
    Read a sausage file and return a list of probability distributions.

    Every line starting with ``align`` has the form
    ``align a w1 p1 w2 p2 ...``; the first two tokens are ignored and the
    remaining word/probability pairs become one ``DictionaryProbDist``.
    The first and last distributions must be the sentence boundaries
    ``<s>`` and ``</s>``; they are stripped from the result.

    :param fname: path of the sausage file.
    :return: list of ``DictionaryProbDist``, sentence boundaries removed.
    :raise AssertionError: if the boundary distributions are not exactly
        ``<s>`` and ``</s>``.
    """
    sausage = []
    with open(fname) as f:
        for line in f:
            if not line.startswith('align'):
                continue
            # align a w1 p1 w2 p2 ...
            # split line and ignore first two tokens
            bits = line.split()[2:]
            dist = DictionaryProbDist(
                {w: float(p) for w, p in zip(bits[::2], bits[1::2])})
            sausage.append(dist)

    # remove sentence boundaries
    # samples() may return a dict view rather than a list, and a view never
    # compares equal to a list, so materialize it before comparing.
    assert list(sausage[0].samples()) == ['<s>']
    assert list(sausage[-1].samples()) == ['</s>']
    return sausage[1:-1]
def __init__(self, label_dist, emission_dist, label_dom, emission_dom,
             mutable=False):
    """
    Build the model from a transition and an emission distribution.

    @type label_dist: nltk prob dist
    @param label_dist: transition distribution
    @type emission_dist: nltk prob dist
    @param emission_dist: emission distribution
    @type label_dom: list
    @param label_dom: state domain
    @type emission_dom: list
    @param emission_dom: emission domain
    @type mutable: bool
    @param mutable: if true, the distributions stored will be mutable
        dictionary distributions, so the model can be updated

    """
    self.order = 2

    # Domains and their sizes
    self.label_dom = label_dom
    self.emission_dom = emission_dom
    self.num_labels = len(label_dom)
    self.num_emissions = len(emission_dom)

    # Keep dictionary-based versions of both distributions
    self.label_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        label_dist, mutable=mutable)
    self.emission_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        emission_dist, mutable=mutable)

    # Unconditioned emission distribution: for every sample, add up its
    # probability under each label of the emission distribution passed in
    totals = {}
    for label in emission_dist.conditions():
        label_emissions = emission_dist[label]
        for sample in label_emissions.samples():
            totals[sample] = \
                totals.get(sample, 0.0) + label_emissions.prob(sample)
    self.observation_dist = DictionaryProbDist(totals)

    self.seen_labels = label_dom
    self.backoff_model = None

    # The caches start empty and get filled as probabilities are accessed
    self.clear_cache()
def prob_classify(self, feat):
    '''Return a DictionaryProbDist of averaged label probabilities.

    Each wrapped classifier contributes the probability it assigns to every
    label it reports. A label's average is taken over the contributions that
    label actually received.
    '''
    collected = collections.defaultdict(list)

    for clf in self._classifiers:
        try:
            dist = clf.prob_classify(feat)
            for lbl in dist.samples():
                collected[lbl].append(dist.prob(lbl))
        except NotImplementedError:
            # No prob_classify available (e.g. DecisionTree): treat the
            # plain classification as having probability 1.
            collected[clf.classify(feat)].append(1)

    return DictionaryProbDist({
        lbl: float(sum(values)) / len(values)
        for lbl, values in collected.items()
    })
def prob_classify(self, featureset):
    """
    Return a probability distribution over labels for a featureset.

    Feature names never seen with any label are ignored. Each label starts
    from its own log probability, to which the log probability of every
    remaining feature value given that label is added.

    :param featureset: dict mapping feature names to feature values.
    :return: ``DictionaryProbDist`` built from the log probabilities with
        ``normalize=True, log=True``.
    """
    known = self._feature_probdist
    labels = self._labels

    # Copy first so the caller's featureset is left untouched.
    featureset = featureset.copy()
    unseen = [
        fname for fname in featureset
        if not any((label, fname) in known for label in labels)
    ]
    for fname in unseen:
        del featureset[fname]

    # Label log probabilities are the starting point.
    logprob = {label: self._label_probdist.logprob(label) for label in labels}

    for label in labels:
        for fname, fval in featureset.items():
            key = (label, fname)
            if key not in known:
                # Feature known for some other label only: sum_logs([]).
                logprob[label] += sum_logs([])
                continue
            logprob[label] += known[key].logprob(fval)

    return DictionaryProbDist(logprob, normalize=True, log=True)
def prob_classify(self, featureset, priors=None):
    """
    Return a probability distribution over labels for a featureset.

    :param featureset: dict mapping feature names to feature values.
    :param priors: optional mapping from label to prior probability. When
        given, its base-2 logarithms replace the trained label
        probabilities as the starting point; a prior of exactly 0 becomes
        a log probability of -inf. When ``None``, the trained label
        distribution is used.
    :return: ``DictionaryProbDist`` built from the log probabilities with
        ``normalize=True, log=True``.
    """
    # Discard any feature names that we've never seen before.
    # Otherwise, we'll just assign a probability of 0 to
    # everything.
    featureset = featureset.copy()
    # Snapshot the keys: deleting while iterating a live key view raises
    # RuntimeError on Python 3.
    for fname in list(featureset.keys()):
        if not any((label, fname) in self._feature_probdist
                   for label in self._labels):
            del featureset[fname]

    # Find the log probability of each label, given the features.
    # Start with the log probability of the label itself.
    logprob = {}
    if priors is None:
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)
    else:
        for label, prob in priors.items():
            # math.log(0) raises; an impossible label is -inf in log space.
            # Negative priors still raise ValueError from math.log.
            if prob == 0:
                logprob[label] = float('-inf')
            else:
                logprob[label] = math.log(prob, 2)

    # Then add in the log probability of features given labels.
    for label in self._labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in self._feature_probdist:
                feature_probs = self._feature_probdist[label, fname]
                logprob[label] += feature_probs.logprob(fval)
            else:
                # nb: This case will never come up if the
                # classifier was created by
                # NaiveBayesClassifier.train().
                logprob[label] += sum_logs([])  # = -INF.

    return DictionaryProbDist(logprob, normalize=True, log=True)
# Drop stop words from the King of the Wind word list and record how many
# words remain.
skwWords = [w for w in akwWords if not w in kwstopWords]
kwDict['NoStop'] = len(skwWords)

## MAYBE USE THIS?
# remove small words
# elected not to use this
# NOTE(review): despite the remark above, the filter call below is live code.
finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 2)
# Score the bigrams by raw frequency and print the first 20 entries.
scored = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:20]:
    print(bscore)

# need to stem, but really only want to stem "horse" and "horses"
# First list the top 50 words by frequency (normalized by the length of the document)
bbDist = FreqDist(sbbWords)
# Normalized view of the same counts.
bbDist2 = DictionaryProbDist(bbDist, normalize=True)
# The two lookups below discard their results (they only display a value
# when run interactively).
bbDist2.prob('black')
bbDist2.prob('horse')
bbDist.plot(50)
# need to make second number number / len(sbbWords)
bbItems = bbDist.most_common(50)
# Show the normalized probability
# NOTE(review): bbItems comes from bbDist, so these are (word, count) pairs,
# not normalized probabilities.
for item in bbItems:
    print(item)

# King of the Wind Frequency Distribution
kwDist = FreqDist(skwWords)
kwDist2 = DictionaryProbDist(kwDist, normalize=True)
# As above, these lookups discard their results.
kwDist2.prob('said')
kwDist2.prob('agba')
kwDist.plot(50)
def _make_probdist(self, y_proba):
    """Wrap a sequence of probabilities in a DictionaryProbDist.

    The i-th entry of ``y_proba`` is keyed by the i-th entry of
    ``self._encoder.classes_``.
    """
    labels = self._encoder.classes_
    by_label = {labels[index]: prob for index, prob in enumerate(y_proba)}
    return DictionaryProbDist(by_label)
def pd(values, samples):
    """Return a DictionaryProbDist mapping each sample to its paired value.

    ``samples`` and ``values`` are paired positionally; pairing stops at the
    shorter of the two.
    """
    mapping = {}
    for sample, value in zip(samples, values):
        mapping[sample] = value
    return DictionaryProbDist(mapping)
# Classify one featureset and report accuracy on the test set.
print(nb_classifier.classify(posfeat))
print(accuracy(nb_classifier, test_feats))

# Inspect the label distribution predicted for the first test featureset.
probs = nb_classifier.prob_classify(test_feats[0][0])
print(probs.samples())
print(probs.max())
print(probs.prob('pos'))
print(probs.prob('neg'))

print(nb_classifier.most_informative_features(n=5))
print("############################################################################")
# NOTE(review): show_most_informative_features presumably prints its own
# report, so the outer print would only add its return value — confirm.
print(nb_classifier.show_most_informative_features(n=5))
print("############################################################################")

# Retrain with a different estimator and report accuracy again.
nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist)
print("Accuracy: " + str(accuracy(nb_classifier, test_feats)))
# Accuracy: 0.76

# Build a classifier by hand: two equally likely labels, each tied to one
# feature whose only value is True.
label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})
true_probdist = DictionaryProbDist({True: 1})
feature_probdist = {('pos', 'yes'): true_probdist, ('neg', 'no'): true_probdist}
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
print(classifier.classify({'yes': True}))
print(classifier.classify({'no': True}))
def parse_weka_distribution(self, s):
    """Turn a distribution string into a DictionaryProbDist.

    ``s`` is split on runs of ``*`` and ``,``; every non-blank piece is read
    as a float and paired, in order, with ``self._formatter.labels()``.
    """
    values = []
    for piece in re.split('[*,]+', s):
        if piece.strip():
            values.append(float(piece))
    by_label = dict(zip(self._formatter.labels(), values))
    return DictionaryProbDist(by_label)
def train(positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5,
          estimator=ELEProbDist):
    """
    Train a classifier from positive and unlabeled examples only.

    :param positive_featuresets: A list of featuresets that are known as
        positive examples (i.e., their label is ``True``).

    :param unlabeled_featuresets: A list of featuresets whose label is
        unknown.

    :param positive_prob_prior: A prior estimate of the probability of the
        label ``True`` (default 0.5).

    :param estimator: callable invoked as ``estimator(freqdist, bins=n)``
        to turn a frequency distribution into a probability distribution.

    :return: a ``PositiveNaiveBayesClassifier`` built from the estimated
        label and feature distributions.
    """
    positive_feature_freqdist = defaultdict(FreqDist)
    unlabeled_feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Counts are updated with item assignment rather than FreqDist.inc():
    # inc() was removed in NLTK 3, while ``fd[x] += n`` works on both the
    # old and the new FreqDist.

    # Count up how many times each feature value occurred in positive
    # examples.
    for featureset in positive_featuresets:
        for fname, fval in featureset.items():
            positive_feature_freqdist[fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # Count up how many times each feature value occurred in unlabeled
    # examples.
    for featureset in unlabeled_featuresets:
        for fname, fval in featureset.items():
            unlabeled_feature_freqdist[fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then we assume
    # that it gets the implicit value 'None'.
    num_positive_examples = len(positive_featuresets)
    for fname in fnames:
        count = positive_feature_freqdist[fname].N()
        positive_feature_freqdist[fname][None] += num_positive_examples - count
        feature_values[fname].add(None)

    num_unlabeled_examples = len(unlabeled_featuresets)
    for fname in fnames:
        count = unlabeled_feature_freqdist[fname].N()
        unlabeled_feature_freqdist[fname][None] += \
            num_unlabeled_examples - count
        feature_values[fname].add(None)

    negative_prob_prior = 1.0 - positive_prob_prior

    # Create the P(label) distribution.
    label_probdist = DictionaryProbDist({
        True: positive_prob_prior,
        False: negative_prob_prior
    })

    # Create the P(fval|label, fname) distribution.
    feature_probdist = {}
    for fname, freqdist in positive_feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[True, fname] = probdist

    for fname, freqdist in unlabeled_feature_freqdist.items():
        global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
        negative_feature_probs = {}
        for fval in feature_values[fname]:
            # P(fval|False) is derived from the global (unlabeled) estimate
            # by removing the positive share and rescaling by the negative
            # prior.
            prob = (global_probdist.prob(fval)
                    - positive_prob_prior
                    * feature_probdist[True, fname].prob(fval)) \
                   / negative_prob_prior
            # TODO: We need to add some kind of smoothing here, instead of
            # setting negative probabilities to zero and normalizing.
            negative_feature_probs[fval] = max(prob, 0.0)
        feature_probdist[False, fname] = DictionaryProbDist(
            negative_feature_probs, normalize=True)

    return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
def pd(values, samples):
    """Return a DictionaryProbDist mapping each sample to its paired value.

    ``values`` and ``samples`` are paired positionally; when a sample occurs
    more than once, its last value wins.
    """
    return DictionaryProbDist(dict(zip(samples, values)))
def initialize_chord_types(cls, probs, model_name="default",
                           chord_set="scale+dom7"):
    """
    Build a new model whose distributions are naively initialized to
    favour simple chord-types, as R&S do in the paper. They don't say
    what values they use for C{probs}, except that they're high, medium
    and low respectively.

    The transition distribution is set up so that everything is
    equiprobable.

    @type probs: 3-tuple of floats
    @param probs: probability mass to assign to (0.) chord notes,
        (1.) scale notes and (2.) other notes. The three values should
        sum to 1.0 (but will be normalized to if they don't)

    """
    total = sum(probs)
    probs = [p / total for p in probs]

    # Emission distribution: the same table for each possible r-value.
    # The chord-note mass is shared equally between outcomes 0, 1 and 2.
    chord_note_mass = probs[0] / 3.0
    dists = {}
    for r in range(4):
        dists[r] = DictionaryProbDist({
            0: chord_note_mass,
            1: chord_note_mass,
            2: chord_note_mass,
            3: probs[1],
            4: probs[2],
        })
    emission_dist = DictionaryConditionalProbDist(dists)

    # Counts that will make every transition equiprobable
    key_transition_counts = ConditionalFreqDist()
    chord_transition_counts = ConditionalFreqDist()
    chord_counts = {}

    # Walk over every ordered pair of possible labels
    label_dom = cls.get_label_dom(chord_set=chord_set)
    for label0 in label_dom:
        for label1 in label_dom:
            key, pkey = states_to_key_transition(label1, label0)
            # One count for the key transition of this state transition
            key_transition_counts[pkey].inc(key)

            # And one for the chord transition of this state transition
            same_key = label0[0] == label1[0] and label0[1] == label1[1]
            if same_key:
                # tonic = tonic', mode = mode'
                chord_transition_counts[label0[2]].inc(label1[2])
            else:
                chord_counts[label1[2]] = chord_counts.get(label1[2], 0) + 1

    # Estimate distributions from the counts, then convert each of them to
    # its dictionary-based form
    key_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        ConditionalProbDist(key_transition_counts, mle_estimator, None))
    chord_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        ConditionalProbDist(chord_transition_counts, mle_estimator, None))
    chord_dist = prob_dist_to_dictionary_prob_dist(
        DictionaryProbDist(chord_counts))

    model = cls(key_dist,
                chord_trans_dist,
                emission_dist,
                chord_dist,
                model_name=model_name,
                chord_set=chord_set)
    model.add_history(
        "Initialized model '%s' to chord type probabilities, using "
        "parameters: %s, %s, %s" % (model_name, probs[0], probs[1], probs[2]))
    return model
def setUp(self):
    """
    Create the training/test sequences and an initial model to train.

    """
    self.TRAINING_DATA = [
        [0, 5, 5, 7, 6, 7, 8, 5, 2, 0, 3, 1, 2, 2, 2, 9, 9, 8, 0, 8, 9, 9,
         1, 3, 2, 2, 1],
        [3, 3, 1, 2, 1, 1, 0, 1, 9, 7, 8, 7, 7, 9, 0],
        [7, 8, 6, 9, 8, 9, 9, 1, 3, 0, 1, 3, 0, 1, 1, 0, 5, 7, 5, 4, 5, 7,
         7],
    ]
    self.TEST_DATA = [0, 1, 2, 3, 4, 3, 5, 6, 7, 8, 8, 9, 7, 7, 0, 0, 1]

    states = ['H', 'M', 'L']
    ems = list(range(10))

    # Initial emission distributions: one row per state, holding the
    # probability of each emission 0..9 in order
    emission_rows = [
        ('H', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.3, 0.3, 0.3]),
        ('M', [0.0, 0.0, 0.0, 0.1, 0.3, 0.3, 0.3, 0.0, 0.0, 0.0]),
        ('L', [0.2, 0.2, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0]),
    ]
    emissions = {}
    for state, row in emission_rows:
        emissions[state] = DictionaryProbDist(dict(zip(ems, row)))
    emdist = DictionaryConditionalProbDist(emissions)

    # Initial transition distributions: from every state (and from None),
    # each state and None is given 1/3
    followers = states + [None]
    transitions = {}
    for first in followers:
        transitions[(first, )] = DictionaryProbDist(
            dict.fromkeys(followers, 1.0 / 3))
    transdist = DictionaryConditionalProbDist(transitions)

    # The model under test starts from these distributions
    self.model = DictionaryHmmModel(transdist, emdist, states, ems)
class NovelParagraph:
    """Generate sentences word by word from stored NGram rows.

    Positional arguments are ``(source, probability)`` pairs; the optional
    ``strategy`` keyword (default ``'best'``) selects how the next word is
    matched.
    """

    def __init__(self, *args, **kwargs):
        self.strategy = kwargs.get('strategy', 'best')
        self.events = []
        self.sentences = []
        self.querysets = {}
        self.sources = []
        self.symmetrical_tokens = []

        weights = {}
        for source, probability in args:
            weights[source] = probability
            self.querysets[source] = NGram.objects.filter(
                **reconcile_old_style_source(source)
            )
            self.sources.append(source)
            if self.querysets[source].count() == 0:
                raise InvalidSourceException("No NGrams with this source")
        self.source_probability = DictionaryProbDist(weights)

    def pick_queryset(self):
        """Return the queryset of a source drawn from the source weights."""
        chosen_source = self.source_probability.generate()
        return self.querysets[chosen_source]

    def append_sentence(self):
        """Generate one sentence and append it to ``self.sentences``."""
        self.current_sentence = []
        starter = (
            self.pick_queryset()
            .filter(sentence_starter=True)
            .order_by('?')
            .first()
        )
        self.current_sentence.extend([
            (starter.token_one, starter.tag_one),
            (starter.token_two, starter.tag_two),
            (starter.token_three, starter.tag_three),
        ])
        # Keep adding words until the sentence ends in terminal punctuation.
        while self.current_sentence[-1][0] not in TERMINAL_PUNCTUATION:
            self.current_sentence.append(self.new_word())
        self.sentences.append(self.current_sentence)

    def _get_others(self, original):
        """Return the querysets of every source except ``original``."""
        remaining = self.sources.copy()
        remaining.remove(original)
        others = []
        for source in remaining:
            others.append(
                NGram.objects.filter(**reconcile_old_style_source(source))
            )
        return others

    def _account_for_symmetrical_tokens(self, token):
        """Queue the SYMMETRICAL_TOKENS entry for ``token``, if it has one."""
        if token not in SYMMETRICAL_TOKENS:
            return
        partner = SYMMETRICAL_TOKENS[token]
        self.symmetrical_tokens.append((partner, partner))

    def new_word(self):
        """Return the next ``(token, tag)`` pair for the current sentence."""
        queryset = self.pick_queryset()
        candidates = [queryset]
        if len(self.sources) > 1:
            # Work out which source the picked queryset belongs to, so the
            # other sources can serve as fallbacks.
            if queryset.first().twitter_user:
                source = queryset.first().twitter_user.twitter_id + '@twitter'
            else:
                source = 'document:'+queryset.first().document.name
            candidates = candidates + self._get_others(source)

        for qs in candidates:
            word = self.new_word_from_queryset(qs)
            if not word:
                continue
            self._account_for_symmetrical_tokens(word[0])
            # Emit a pending symmetrical token before the sentence ends.
            if word[0] in TERMINAL_PUNCTUATION and self.symmetrical_tokens:
                return self.symmetrical_tokens.pop()
            return word

        # No queryset produced a word.
        if self.symmetrical_tokens:
            return self.symmetrical_tokens.pop()
        return ('.', '.')

    def _best_matching_word(self, queryset):
        """Return a random NGram continuing the last two words, or None."""
        before_last, last = self.current_sentence[-2], self.current_sentence[-1]

        if self.strategy == 'grammar_only':
            return queryset.filter(
                tag_one=before_last[1],
                tag_two=last[1],
            ).order_by('?').first()

        # Try tokens and tags first, then fall back to tokens alone.
        match = queryset.filter(
            token_one__iexact=before_last[0],
            token_two__iexact=last[0],
            tag_one=before_last[1],
            tag_two=last[1],
        ).order_by('?').first()
        if match:
            return match
        return queryset.filter(
            token_one__iexact=before_last[0],
            token_two__iexact=last[0],
        ).order_by('?').first()

    def new_word_from_queryset(self, queryset):
        """Return ``(token, tag)`` of the best match, or None without one."""
        match = self._best_matching_word(queryset)
        if not match:
            return None
        return (match.token_three, match.tag_three)

    @classmethod
    def _needs_space(cls, token, previous_token, index):
        """Tell whether a space belongs before ``token``."""
        return not (
            index == 0
            or previous_token in NO_TRAILING_SPACE_TOKENS
            or token in NO_LEADING_SPACE_TOKENS
        )

    @classmethod
    def _join_and_postprocess_sentences(cls, sentences):
        """Join the token lists into text and apply REGEX_REPLACEMENTS."""
        text = ' '.join(''.join(pieces) for pieces in sentences)
        for pattern, replacement in REGEX_REPLACEMENTS:
            text = re.sub(pattern, replacement, text)
        return text

    def human_readable_sentences(self):
        """Return all generated sentences as one readable string."""
        rendered = []
        for sent in self.sentences:
            pieces = []
            for position, (token, _tag) in enumerate(sent):
                previous = sent[position-1][0]
                if NovelParagraph._needs_space(token, previous, position):
                    pieces.append(' ')
                pieces.append(token)
            rendered.append(pieces)
        return NovelParagraph._join_and_postprocess_sentences(rendered)
def set_chord_transition_probabilities(self, spec):
    """
    Sets the parameters of the chord transition distribution. This is used
    in initialization. The parameters are extracted from a string: this is
    so that it can be specified in a script option.

    The required format of the string is a comma-separated list of
    parameters given as C0->C1-P, where C0 and C1 are chords (I, II, etc)
    that are in the model's distribution and P is a float probability.
    Parameters not specified will be evenly distributed the remaining
    probability mass.

    @type spec: string
    @param spec: the parameter specification, in the format given above
    @raise RaphstoHmmParameterError: if a parameter cannot be parsed, names
        an unrecognised chord or a chord not used with this model, or has
        a probability that is not a valid float

    """
    params = {}
    param_re = re.compile(r'(?P<chord0>.+)->(?P<chord1>.+)-(?P<prob>.+)')
    # Invert the id -> name table so that names can be looked up
    chord_ids = dict(
        (name, num) for (num, name) in constants.CHORD_NAMES.items())

    def _chord_id(name):
        # Get the id for the named chord
        if name not in chord_ids:
            raise RaphstoHmmParameterError(
                "unrecognised chord name '%s' "
                "in parameter spec: %s" % (name, spec))
        cid = chord_ids[name]
        if cid not in self.chord_transition_dom:
            raise RaphstoHmmParameterError(
                "chord %s is not used with this "
                "model (in parameter spec: %s)" % (name, spec))
        return cid

    for param_str in spec.split(","):
        # Pull out the bits of the parameter specification
        match = param_re.match(param_str.strip())
        if not match:
            raise RaphstoHmmParameterError(
                "could not parse parameter "
                "spec: %s (in: %s)" % (param_str, spec))
        parts = match.groupdict()
        chord0 = _chord_id(parts['chord0'])
        chord1 = _chord_id(parts['chord1'])
        try:
            prob = float(parts['prob'])
        except ValueError:
            raise RaphstoHmmParameterError(
                "not a valid probability: %s "
                "(in %s)" % (parts['prob'], spec))
        # Store the parameter value
        params.setdefault(chord0, {})[chord1] = prob

    # Set the values in the transition distribution
    dists = {}
    for chord0 in self.chord_transition_dom:
        dist_params = {}
        if chord0 not in params:
            # Not given in the spec: uniform distribution
            uniform_mass = 1.0 / len(self.chord_transition_dom)
            for chord1 in self.chord_transition_dom:
                dist_params[chord1] = uniform_mass
        else:
            # Work out the prob mass to be distributed among unspecified
            # parameters
            not_given = len(self.chord_transition_dom) - len(params[chord0])
            if not_given > 0:
                given_mass = sum(params[chord0].values(), 0.0)
                uniform_mass = (1.0 - given_mass) / not_given
            else:
                uniform_mass = 0.0
            # Calculate the whole distribution
            for chord1 in self.chord_transition_dom:
                if chord1 in params[chord0]:
                    dist_params[chord1] = params[chord0][chord1]
                else:
                    dist_params[chord1] = uniform_mass
        dists[chord0] = DictionaryProbDist(dist_params)

    # Use this distribution instead of what's already there
    self.chord_transition_dist = DictionaryConditionalProbDist(dists)
    self.add_history("Set chord transition distribution using "
                     "parameters: %s" % spec)
sep="\t", quoting=csv.QUOTE_NONE, header=None)
# Index both tables by their first two columns.
hmm_emits_pd = hmm_emits_pd.set_index([0, 1])
hmm_trans_pd = hmm_trans_pd.set_index([0, 1])
# Exponentiate every value (presumably the files store log-probabilities —
# TODO confirm).
hmm_trans_pd = hmm_trans_pd.apply(lambda x: np.exp(x))
hmm_emits_pd = hmm_emits_pd.apply(lambda x: np.exp(x))

# Transition distribution: one DictionaryProbDist per tag, keyed by the
# second index level of that tag's rows.
# NOTE(review): DataFrame.ix was deprecated in pandas 0.20 and removed in
# 1.0; .loc is the label-based replacement — confirm the pandas version.
tag_dict_tag = dict()
for tag in distinct_tags:
    tag_dict = dict(
        zip(hmm_trans_pd.ix[tag].index, hmm_trans_pd.ix[tag].values.ravel()))
    #missing_to_dict = list(set(distinct_tags).difference(tag_dict.keys()))
    #tag_dict.update(zip(missing_to_dict,np.zeros(len(missing_to_dict))))
    tag_dict_tag[tag] = DictionaryProbDist(tag_dict)
transition = DictionaryConditionalProbDist(tag_dict_tag)

# Emission distribution, built the same way from the emission table.
tag_dict_word = dict()
for tag in distinct_tags:
    tag_dict = dict(
        zip(hmm_emits_pd.ix[tag].index, hmm_emits_pd.ix[tag].values.ravel()))
    #missing_to_dict = list(set(distinct_tags).difference(tag_dict_word.keys()))
    #tag_dict_word.update(zip(missing_to_dict,np.zeros(len(missing_to_dict))))
    tag_dict_word[tag] = DictionaryProbDist(tag_dict)
emission = DictionaryConditionalProbDist(tag_dict_word)


def get_value(df, index_1, index_2):
def _make_probdist(self, y_proba):
    """Wrap a sequence of probabilities in a DictionaryProbDist.

    The i-th entry of ``y_proba`` is keyed by ``self._index_label[i]``.
    """
    by_label = {}
    for index, prob in enumerate(y_proba):
        by_label[self._index_label[index]] = prob
    return DictionaryProbDist(by_label)
weights = memcache.get_multi(allkeys, namespace=cache_ver, key_prefix=cache_ver) for label in labels: feature_vector = [] for i in groups[label]: if i in weights: # ?? maybe get_multi didnt return feature_vector.append((weights[i], 1)) if alwayson and label in alwayson: feature_vector.append((alwayson[label], 1)) total = 0.0 for (weight, f_val) in feature_vector: total += weight * f_val prob_dict[label] = total # Normalize the dictionary to give a probability distribution return DictionaryProbDist(prob_dict, log=True, normalize=True) def encode(featureset, label, alwayson): # Inherit docs. encoding = [] # Convert input-features to joint-features: keys = [(fname + str(fval) + label) for fname, fval in featureset.items()] # Known feature name & value: weights = memcache.get_multi(keys, namespace=cache_ver, key_prefix=cache_ver) for i in weights: encoding.append((weights[i], 1)) # Add always-on features: if alwayson and label in alwayson:
def initialize_chord_classes(cls, tetrad_prob, max_notes, grammar,
        illegal_transitions=None, fixed_root_transitions=None, metric=False):
    """
    Creates a new model with the distributions initialized naively to
    favour simple chord-types, in a similar way to what R&S do in the
    paper.

    The transition distribution is initialized so that everything is
    equiprobable.

    @type tetrad_prob: float
    @param tetrad_prob: prob of a note in the tetrad. This prob is
        distributed over the notes of the tetrad. The remaining prob
        mass is distributed over the remaining notes. You'll want this
        to be >0.33, so that tetrad notes are more probable than others.
    @type max_notes: int
    @param max_notes: maximum number of notes that can be generated in
        each emission. Usually best to set to something high, like 100 -
        it's just to make the distribution finite.
    @type grammar: L{jazzparser.grammar.Grammar}
    @param grammar: grammar from which to take the chord class definitions
    @param illegal_transitions: collection of label groups; every label
        must be a schema in the grammar. Passed on to the model. Defaults
        to an empty list.
    @param fixed_root_transitions: mapping whose keys are label groups;
        every label must be a schema in the grammar. Passed on to the
        model. Defaults to an empty dict.
    @type metric: bool
    @param metric: if True, creates a model with a metrical component
        (dependence on metrical position). Default False
    @raise ValueError: if a transition specification names a label that is
        not a schema in the grammar

    """
    # Fresh containers per call: the values are handed to the model, so a
    # shared mutable default would be shared between models.
    if illegal_transitions is None:
        illegal_transitions = []
    if fixed_root_transitions is None:
        fixed_root_transitions = {}

    # Only use chord classes that are used by some morph item in the lexicon
    classes = [ccls for ccls in grammar.chord_classes.values() if ccls.used]

    # Create a probability distribution for the emission distribution
    dists = {}

    # Create the distribution for each possible r-value if we're creating
    # a metrical model
    if metric:
        r_vals = range(4)
    else:
        r_vals = [0]
    # Separate emission distribution for each chord class
    for ccls in classes:
        for r in r_vals:
            probabilities = {}
            # We assign two different probabilities: in tetrad or out
            # Don't assume the tetrad has 4 notes!
            in_tetrad_prob = tetrad_prob / len(ccls.notes)
            out_tetrad_prob = (1.0 - tetrad_prob) / (12 - len(ccls.notes))
            # Give a probability to every pitch class
            for d in range(12):
                if d in ccls.notes:
                    probabilities[d] = in_tetrad_prob
                else:
                    probabilities[d] = out_tetrad_prob
            dists[(ccls.name, r)] = DictionaryProbDist(probabilities)
    emission_dist = DictionaryConditionalProbDist(dists)

    # Take the state labels from the lexical entries in the grammar
    # Include only tonic categories that were generated from lexical
    # expansion rules - i.e. only tonic repetition categories
    schemata = grammar.midi_families.keys()

    # Check that the transition constraint specifications refer to existing
    # schemata
    for labels in illegal_transitions:
        for label in labels:
            if label not in schemata:
                raise ValueError(
                    "%s, given in illegal transition "
                    "specification, is not a valid schema in the grammar"
                    % label)
    for labels in fixed_root_transitions:
        for label in labels:
            if label not in schemata:
                raise ValueError(
                    "%s, given in fixed root transition "
                    "specification, is not a valid schema in the grammar"
                    % label)

    # Build from the grammar a mapping from lexical schemata (POSs) to
    # chord classes
    chord_class_mapping = {}
    for morph in grammar.morphs:
        if morph.pos in schemata:
            chord_class_mapping.setdefault(morph.pos, []).append(
                str(morph.chord_class.name))
    # Make sure that every label appears in the mapping
    for label in schemata:
        if label not in chord_class_mapping:
            chord_class_mapping[label] = []

    # Initialize transition distribution so every transition is equiprobable
    schema_transition_counts = ConditionalFreqDist()
    root_transition_counts = ConditionalFreqDist()
    for label0 in schemata:
        for label1 in schemata:
            # Increment the count once for each chord class associated
            # with this schema: schemata with 2 chord classes get 2
            # counts
            for _cclass in chord_class_mapping[label1]:
                schema_transition_counts[label0].inc(label1)
            # NOTE(review): the nesting here was reconstructed from a
            # flattened source; confirm that this loop runs once per
            # (label0, label1) pair rather than once per chord class.
            for root_change in range(12):
                # Give one count to the root transition corresponding to
                # this state transition
                root_transition_counts[(label0, label1)].inc(root_change)
        # Give a count to finishing in this state
        schema_transition_counts[label0].inc(None)

    # Estimate distribution from this frequency distribution
    schema_trans_dist = ConditionalProbDist(
        schema_transition_counts, mle_estimator, None)
    root_trans_dist = ConditionalProbDist(
        root_transition_counts, mle_estimator, None)
    # Sample this to get dictionary prob dists
    schema_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        schema_trans_dist)
    root_trans_dist = cond_prob_dist_to_dictionary_cond_prob_dist(
        root_trans_dist)

    # Do the same with the initial states (just schemata, not roots)
    initial_state_counts = FreqDist()
    for label in schemata:
        initial_state_counts.inc(label)
    initial_state_dist = mle_estimator(initial_state_counts, None)
    initial_state_dist = prob_dist_to_dictionary_prob_dist(initial_state_dist)

    # Also initialize the notes number distribution to uniform
    emission_number_counts = FreqDist()
    for i in range(max_notes):
        emission_number_counts.inc(i)
    emission_number_dist = mle_estimator(emission_number_counts, None)
    emission_number_dist = prob_dist_to_dictionary_prob_dist(
        emission_number_dist)

    # Create the model
    model = cls(schema_trans_dist,
                root_trans_dist,
                emission_dist,
                emission_number_dist,
                initial_state_dist,
                schemata,
                chord_class_mapping,
                classes,
                metric=metric,
                illegal_transitions=illegal_transitions,
                fixed_root_transitions=fixed_root_transitions)
    model.add_history(
        "Initialized model to chord type probabilities, using "
        "tetrad probability %s. Metric: %s" %
        (tetrad_prob, metric))

    return model