def learn_parameters(self):
    """Build ``self.parameters`` from the current training split.

    A ``LabelParameters`` object is created from the word counts of all
    training items together with the list of labels, then fed the word
    counts of each label's own items, and finally asked to precompute.
    """
    overall_counts = self.words_count_from(self.all_labelled_data())
    self.parameters = LabelParameters(overall_counts, self.labels)

    for label in self.labels:
        counts_for_label = self.words_count_from(self.labelled_data[label])
        self.parameters.process(counts_for_label, label)

    self.parameters.precompute()
class NaiveLearner:
    """Split labelled items into train/test sets and learn word-count parameters.

    Each item in ``data`` must expose a ``label`` attribute and a ``text``
    attribute. On construction, 90% of the items of every label are sampled
    at random into ``self.labelled_data`` (training) and the remainder go to
    ``self.test_data``; ``self.parameters`` is then built from the training
    split.
    """

    def __init__(self, data):
        """Store ``data``, create the 90/10 split, and learn the parameters."""
        self.all_data = data
        self.populate_training_data(.9)
        self.learn_parameters()

    def populate_training_data(self, ratio):
        """Group items by label and randomly split each group.

        Args:
            ratio: Fraction of each label's items to keep for training;
                ``int(len(group) * ratio)`` items are sampled per label.

        Sets ``self.labels`` (labels in order of first appearance),
        ``self.labelled_data`` (label -> training items, in sample order)
        and ``self.test_data`` (label -> held-out items, in original order).
        """
        self.labels = []
        self.labelled_data = {}
        self.test_data = {}

        for data in self.all_data:
            if data.label not in self.labelled_data:
                self.labels.append(data.label)
                self.labelled_data[data.label] = []
            self.labelled_data[data.label].append(data)

        # items() works on both Python 2 and 3; the list() snapshot keeps the
        # iteration independent of the value reassignment below.
        for label, labeled_data in list(self.labelled_data.items()):
            indices_to_include = random.sample(
                range(len(labeled_data)), int(len(labeled_data) * ratio))
            # Set lookup instead of scanning the sampled list for every index.
            included = set(indices_to_include)
            self.test_data[label] = [
                item for i, item in enumerate(labeled_data)
                if i not in included
            ]
            self.labelled_data[label] = [
                labeled_data[i] for i in indices_to_include
            ]

    def learn_parameters(self):
        """Build ``self.parameters`` from the training split.

        ``LabelParameters`` receives the word counts over all training items
        plus the label list, then the per-label word counts, and is finally
        asked to precompute.
        """
        self.parameters = LabelParameters(
            self.words_count_from(self.all_labelled_data()), self.labels)
        for label in self.labels:
            self.parameters.process(
                self.words_count_from(self.labelled_data[label]), label)
        self.parameters.precompute()

    def words_count_from(self, data):
        """Return a dict mapping each ``\\w+`` token to its total occurrences.

        Args:
            data: Iterable of items with a ``text`` attribute.
        """
        # Compile once per call rather than once per item.
        word_pattern = re.compile(r'\w+')
        words_count = {}
        for item in data:
            for word in word_pattern.findall(item.text):
                words_count[word] = words_count.get(word, 0) + 1
        return words_count

    def all_labelled_data(self):
        """Return a new flat list of every training item across all labels."""
        return [
            item
            for group in self.labelled_data.values()
            for item in group
        ]

    def sampled_label_probability(self, label):
        """Return the fraction of training items that carry ``label``.

        Raises:
            KeyError: If ``label`` has no entry in the training data.
            ZeroDivisionError: If the training split contains no items.
        """
        total = sum(len(group) for group in self.labelled_data.values())
        return 1.0 * len(self.labelled_data[label]) / total

    def sampled_label_probabilities(self):
        """Return a dict mapping every known label to its training fraction."""
        return {
            label: self.sampled_label_probability(label)
            for label in self.labels
        }
def read_model(self, param_reader):
    """Rebuild label parameters and prior probabilities from tabular rows.

    Expected row layout, as consumed below (the first cell of the first two
    rows is skipped):

    1. header row: ``[_, label_1, label_2, ...]``
    2. prior row:  ``[_, prior_1, prior_2, ...]`` (parsed with ``float``)
    3. word rows:  ``[word, count_1, count_2, ...]`` (parsed with ``int``),
       read until a row with exactly one cell or the end of input.

    Args:
        param_reader: Iterator yielding rows as sequences of strings
            (presumably a ``csv.reader`` -- confirm against the caller).

    Returns:
        A ``(parameters, prior_probability)`` tuple: the precomputed
        ``LabelParameters`` object and a dict mapping label -> prior.
    """
    label_word_count = {}
    prior_probability = {}

    # Builtin next() works on Python 2.6+ and 3; the .next() method is
    # Python 2-only.
    labels = next(param_reader)[1:]
    for label in labels:
        label_word_count[label] = {}

    prior_row = next(param_reader)
    for i, prior in enumerate(prior_row[1:]):
        prior_probability[labels[i]] = float(prior)

    for row in param_reader:
        # A single-cell row marks the end of the word-count section.
        if len(row) == 1:
            break
        word = row[0]
        for i, word_count in enumerate(row[1:]):
            label_word_count[labels[i]][word] = int(word_count)

    # NOTE(review): only the first label's counts are passed as the
    # constructor's word-count argument; confirm LabelParameters needs just
    # the vocabulary (every label's dict has the same keys) here.
    parameters = LabelParameters(label_word_count[labels[0]], labels)
    for label in labels:
        parameters.process(label_word_count[label], label)
    parameters.precompute()

    return parameters, prior_probability
def learn_parameters(self):
    """Create and populate ``self.parameters`` from the training data.

    The word counts of every training item, along with ``self.labels``,
    initialise a ``LabelParameters`` object; each label's own word counts
    are then processed in label order before the final precompute step.
    """
    combined_counts = self.words_count_from(self.all_labelled_data())
    self.parameters = LabelParameters(combined_counts, self.labels)

    for label in self.labels:
        per_label_counts = self.words_count_from(self.labelled_data[label])
        self.parameters.process(per_label_counts, label)

    self.parameters.precompute()
class NaiveLearner:
    """Split labelled items into train/test sets and learn word-count parameters.

    Each item in ``data`` must expose a ``label`` attribute and a ``text``
    attribute. On construction, 90% of the items of every label are sampled
    at random into ``self.labelled_data`` (training) and the remainder go to
    ``self.test_data``; ``self.parameters`` is then built from the training
    split.
    """

    def __init__(self, data):
        """Store ``data``, create the 90/10 split, and learn the parameters."""
        self.all_data = data
        self.populate_training_data(.9)
        self.learn_parameters()

    def populate_training_data(self, ratio):
        """Group items by label and randomly split each group.

        Args:
            ratio: Fraction of each label's items to keep for training;
                ``int(len(group) * ratio)`` items are sampled per label.

        Sets ``self.labels`` (labels in order of first appearance),
        ``self.labelled_data`` (label -> training items, in sample order)
        and ``self.test_data`` (label -> held-out items, in original order).
        """
        self.labels = []
        self.labelled_data = {}
        self.test_data = {}

        for data in self.all_data:
            if data.label not in self.labelled_data:
                self.labels.append(data.label)
                self.labelled_data[data.label] = []
            self.labelled_data[data.label].append(data)

        # items() works on both Python 2 and 3; the list() snapshot keeps the
        # iteration independent of the value reassignment below.
        for label, labeled_data in list(self.labelled_data.items()):
            indices_to_include = random.sample(
                range(len(labeled_data)), int(len(labeled_data) * ratio))
            # Set lookup instead of scanning the sampled list for every index.
            included = set(indices_to_include)
            self.test_data[label] = [
                item for i, item in enumerate(labeled_data)
                if i not in included
            ]
            self.labelled_data[label] = [
                labeled_data[i] for i in indices_to_include
            ]

    def learn_parameters(self):
        """Build ``self.parameters`` from the training split.

        ``LabelParameters`` receives the word counts over all training items
        plus the label list, then the per-label word counts, and is finally
        asked to precompute.
        """
        self.parameters = LabelParameters(
            self.words_count_from(self.all_labelled_data()), self.labels)
        for label in self.labels:
            self.parameters.process(
                self.words_count_from(self.labelled_data[label]), label)
        self.parameters.precompute()

    def words_count_from(self, data):
        """Return a dict mapping each ``\\w+`` token to its total occurrences.

        Args:
            data: Iterable of items with a ``text`` attribute.
        """
        # Compile once per call rather than once per item.
        word_pattern = re.compile(r'\w+')
        words_count = {}
        for item in data:
            for word in word_pattern.findall(item.text):
                words_count[word] = words_count.get(word, 0) + 1
        return words_count

    def all_labelled_data(self):
        """Return a new flat list of every training item across all labels."""
        return [
            item
            for group in self.labelled_data.values()
            for item in group
        ]

    def sampled_label_probability(self, label):
        """Return the fraction of training items that carry ``label``.

        Raises:
            KeyError: If ``label`` has no entry in the training data.
            ZeroDivisionError: If the training split contains no items.
        """
        total = sum(len(group) for group in self.labelled_data.values())
        return 1.0 * len(self.labelled_data[label]) / total

    def sampled_label_probabilities(self):
        """Return a dict mapping every known label to its training fraction."""
        return {
            label: self.sampled_label_probability(label)
            for label in self.labels
        }