def process_corpus(tr_in_filename, te_in_filename, u_in_filename, tr_out_filename, te_out_filename, u_out_filename): input_f = open(tr_in_filename, 'r') tr_original_corpus = pickle.load(input_f) input_f.close() input_f = open(te_in_filename, 'r') te_original_corpus = pickle.load(input_f) input_f.close() input_f = open(u_in_filename, 'r') u_original_corpus = pickle.load(input_f) input_f.close() tr_instances = [d['question'] for d in tr_original_corpus if '' not in d['target']] te_instances = [d['question'] for d in te_original_corpus if '' not in d['target']] u_instances = [d['question'] for d in u_original_corpus if ((not 'target' in d) or '' not in d['target'])] vect = get_features() vect.fit(tr_instances + te_instances + u_instances) v_instances = vect.transform(tr_instances + te_instances + u_instances) v_instances = csr_matrix(v_instances > 0, dtype=int8) print v_instances.shape tr_corpus = Corpus() tr_corpus.instances = v_instances[:len(tr_instances)] tr_corpus.full_targets = [d['target'] for d in tr_original_corpus if '' not in d['target']] tr_corpus.representations = [_get_repr(i[0]) for i in tr_instances] tr_corpus._features_vectorizer = vect tr_corpus.save_to_file(tr_out_filename) te_corpus = Corpus() te_corpus.instances = v_instances[:len(te_instances)] te_corpus.full_targets = [d['target'] for d in te_original_corpus if '' not in d['target']] te_corpus.representations = [_get_repr(i[0]) for i in te_instances] te_corpus._features_vectorizer = vect te_corpus.save_to_file(te_out_filename) u_corpus = Corpus() u_corpus.instances = v_instances[:len(u_instances)] u_corpus.full_targets = [d['target'] if ('target' in d and '' not in d['target']) else [] for d in u_original_corpus] u_corpus.representations = [_get_repr(i[0]) for i in u_instances] u_corpus._features_vectorizer = vect u_corpus.save_to_file(u_out_filename)
class ActivePipeline(object): """ Attributes: session_filename: emulate: A boolean. If is set to True, the pipe will search for labels in the unlabeled_corpus and in the feature_corpus and will only ask the user if there is no information available. training_corpus: unlabeled_corpus: test_corpus: feature_corpus: A matrix of shape [n_class, n_feat] with three possible values. -1 indicates that the feature was never asked to the user for that class, 0 indicates no relation, and 1 indicates relation between feature and class. The feature corpus will be loaded from the file self.feature_label_f intruduced by the config, and will be used only during user emulation. It can be updated using the function label_feature_corpus. recorded_precision: new_instances: new_features: classes: user_features: user_corpus: """ def __init__(self, session_filename='', emulate=False, **kwargs): """ Args: session_filename: Optional. The name of a file storing a session that will be loaded using the method load_session. emulate: a boolean. Will set the attribute emulate acordinly. **kwargs: the configuration for the pipe. Each parameters passed will be converted to an attribute of the pipe. The minimum configuration possible is set in the defaults file, and each value not passed as a parameter will be taken from there. """ self.session_filename = session_filename self.emulate = emulate self._set_config(kwargs) self._get_corpus() self._get_feature_corpus() self.recorded_precision = [] self.load_session() self.user_features = None self.new_instances = 0 self.new_features = 0 self.classes = [] self._train() self._build_feature_boost() def _set_config(self, config): """Sets the keys of config+default_config dict as an attribute of self. 
""" default_config.update(config) for key, value in default_config.items(): if value is not None: setattr(self, key, value) def _get_corpus(self): self.training_corpus = Corpus() self.training_corpus.load_from_file(self.training_corpus_f) self.unlabeled_corpus = Corpus() self.unlabeled_corpus.load_from_file(self.u_corpus_f) self.test_corpus = Corpus() self.test_corpus.load_from_file(self.test_corpus_f) self.user_corpus = Corpus() def _get_feature_corpus(self): """Loads the feature corpus from self.feature_corpus_f""" f = open(self.feature_corpus_f, 'r') self.feature_corpus = pickle.load(f) f.close() def _build_feature_boost(self): """Creates the user_features np.array with defaults values.""" self.alpha = self.classifier.alpha self.n_class, self.n_feat = self.classifier.feature_log_prob_.shape self.user_features = np.array([[self.alpha] * self.n_feat] * self.n_class) if self.emulate: self.asked_features = self.feature_corpus == 0 else: self.asked_features = self.user_features != self.alpha # False def _train(self): """Fit the classifier with the training set plus the new vectors and features. Then performs a step of EM. 
""" try: if len(self.user_corpus): self.classifier.fit( vstack((self.training_corpus.instances, self.user_corpus.instances), format='csr'), (self.training_corpus.primary_targets + self.user_corpus.primary_targets), features=self.user_features ) else: self.classifier.fit(self.training_corpus.instances, self.training_corpus.primary_targets, features=self.user_features) except ValueError: import ipdb; ipdb.set_trace() self.recorded_precision.append({ 'testing_precision' : self.evaluate_test(), 'training_precision' : self.evaluate_training(), 'new_instances' : self.new_instances, 'new_features' : self.new_features, 'confusion_matrix': confusion_matrix( self.test_corpus.primary_targets, self.predict(self.test_corpus.instances) ), 'feature_boost': self.feature_boost }) self.new_instances = 0 self.new_features = 0 self.classes = self.classifier.classes_.tolist() self._retrained = True def _expectation_maximization(self): """Performs one cicle of expectation maximization. Re estimates the parameters of the multinomial (class_prior and feature_log_prob_) to maximize the expected likelihood. The likelihood is calculated with a probabilistic labeling of the unlabeled corpus plus the known labels from the labeled corpus. 
""" # E-step: Classify the unlabeled pool predicted_proba = self.classifier.predict_proba( self.unlabeled_corpus.instances ) # M-step: Maximizing the likelihood # Unlabeled component instance_proba = self.classifier.instance_proba( self.unlabeled_corpus.instances ) predicted_proba = predicted_proba.T * instance_proba class_prior = predicted_proba.sum(axis=1) feature_prob = safe_sparse_dot(predicted_proba, self.unlabeled_corpus.instances) if len(self.training_corpus) != 0: # Labeled component instance_proba = self.classifier.instance_proba( self.training_corpus.instances ) instance_class_matrix = self._get_instance_class_matrix() predicted_proba = instance_class_matrix.T * instance_proba l_class_prior = predicted_proba.sum(axis=1) l_feat_prob = safe_sparse_dot(predicted_proba, self.training_corpus.instances) class_prior = 0.1 * class_prior + 0.9 * l_class_prior # Aca no deberia sumarle el alpha???? feature_prob = 0.1 * feature_prob + 0.9 * l_feat_prob self.classifier.class_log_prior_ = np.log(class_prior / class_prior.sum()) self.classifier.feature_log_prob_ = np.log(normalize(feature_prob, norm='l1')) def _get_instance_class_matrix(self): """Returns a binary matrix for the training instances and its labels. Returns: An array like, shape = [n_instances, n_class]. Each element is one if the instances is labeled with the class in the training corpus. 
""" m1 = np.arange(len(self.classes)) m1 = m1.reshape((1, len(self.classes))) m1 = np.repeat(m1, len(self.training_corpus), axis=0) m2 = np.zeros((len(self.training_corpus), len(self.classes))) for i in range(len(self.training_corpus)): class_index = self.classes.index( self.training_corpus.primary_targets[i] ) m2[i] = class_index result = (m1 == m2).astype(np.int8, copy=False) assert np.all(result.sum(axis=1) == 1) assert result.sum() == len(self.training_corpus) return result def predict(self, question): return self.classifier.predict(question) def instance_bootstrap(self, get_labeled_instance, max_iterations=None): """Presents a new question to the user until the answer is 'stop'. Args: get_labeled_instance: A function that takes the representation of an instance and a list of possible classes. Returns the correct class for the instance. max_iterations: Optional. An interger. The cicle will execute at most max_iterations times if the user does not enter stop before. Returns: The number of instances the user has labeled. 
""" it = 0 result = 0 while ((not max_iterations or it < max_iterations) and len(self.unlabeled_corpus)): it += 1 new_index = self.get_next_instance() try: new_instance = self.unlabeled_corpus.instances[new_index] except IndexError: import ipdb; ipdb.set_trace() representation = self.unlabeled_corpus.representations[new_index] if (self.emulate and self.unlabeled_corpus.primary_targets[new_index]): prediction = self.unlabeled_corpus.primary_targets[new_index] message = "Emulation: Adding instance {}, {}".format( representation, prediction ) print colored(message, "red") if (not self.emulate or not self.unlabeled_corpus.primary_targets[new_index]): classes = self._most_probable_classes(new_instance) prediction = get_labeled_instance(representation, classes) if prediction == 'stop': break if prediction == 'train': self._train() self._expectation_maximization() continue # if prediction == 'other': # self.unlabeled_corpus.pop_instance(new_index) # continue self.new_instances += 1 result += 1 instance, targets, r = self.unlabeled_corpus.pop_instance(new_index) self.user_corpus.add_instance( instance, [prediction] + targets, r ) return result def feature_bootstrap(self, get_class, get_labeled_features, max_iterations=None): """Presents a class and possible features until the prediction is stop. Args: get_class: A function that receives a list of classes and returns one of them. Can return None in case of error. get_labeled_features: A function that receives a class and a list of features. It must return a list of features associated with the class. Can return None in case of error. max_iterations: Optional. An interger. The cicle will execute at most max_iterations times if the user does not enter stop before. Returns: The number of features the user has labeled. 
""" result = 0 while not max_iterations or result < max_iterations: class_name = get_class(self.get_class_options()) if not class_name: continue if class_name == 'stop': break if class_name == 'train': self._train() self._expectation_maximization() continue class_number = self.classes.index(class_name) feature_numbers = self.get_next_features(class_number) e_prediction = [] prediction = [] if self.emulate: e_prediction = [f for f in feature_numbers if self.feature_corpus[class_number][f] == 1] feature_numbers = [f for f in feature_numbers if f not in e_prediction] print "Adding {0} features from corpus for class {1}".format( len(e_prediction), class_name ) if feature_numbers: feature_names = [self.training_corpus.get_feature_name(pos) for pos in feature_numbers] prediction = get_labeled_features(class_name, feature_names) if prediction == None and not e_prediction: continue if prediction == 'stop': break if prediction == 'train': self._train() self._expectation_maximization() continue prediction = [feature_numbers[feature_names.index(f)] for f in prediction] self.handle_feature_prediction(class_number, feature_numbers + e_prediction, prediction + e_prediction) result += len(prediction + e_prediction) return result def handle_feature_prediction(self, class_number, full_set, prediction): """Adds the new information from prediction to user_features. Args: class_number: an interger. The position of the class in self.classes full_set: a list of positions of features that was given to the user. prediction: a list of positions of features selected for the class. The features not present in this class are considered as negative examples. """ for feature in full_set: if feature in prediction: self.user_features[class_number][feature] += \ self.feature_boost self.asked_features[class_number][feature] = True self.new_features += len(prediction) def _most_probable_classes(self, instance): """Return a list of the most probable classes for the given instance. 
Args: instance: a vector with the instance to be classified Returns: A list of classes of len given by the number_of_classes in the initial configuration. """ classes = self.classifier.predict_log_proba(instance) indexes = classes.argsort() result = [] indexes = indexes[0].tolist() indexes.reverse() for index in indexes[:self.number_of_classes]: result.append(self.classes[index]) result.append(self.classes[-1]) return result def get_next_instance(self): """Selects the index of an unlabeled instance to be sent to the user. Returns: The index of an instance selected from the unlabeled_corpus. """ if len(self.unlabeled_corpus) == 0: return None if self._retrained: self.u_clasifications = self.classifier.predict_proba( self.unlabeled_corpus.instances ) entropy = self.u_clasifications * np.log(self.u_clasifications) entropy = entropy.sum(axis=1) entropy *= -1 self.unlabeled_corpus.add_extra_info('entropy', entropy.tolist()) self._retrained = False # Select the instance min_entropy = min(self.unlabeled_corpus.extra_info['entropy']) return self.unlabeled_corpus.extra_info['entropy'].index(min_entropy) def get_class_options(self): """Sorts a list of classes to present to the user by relevance. The user will choose one to label features associated with the class. Returns: A list of classes. """ return self.classes def get_next_features(self, class_number): """Selects a and a list of features to be sent to the oracle. Args: class_number: An interger. The position of the class where the features will belong in the np.array self.classes. Returns: A list of features numbers of size self.number_of_features. 
""" # Select the positions of the features that cooccur most with the class selected_f_pos = self.classifier.feature_count_[class_number].argsort() # Eliminate labeled features def non_seen_filter(i): return not self.asked_features[class_number][i] selected_f_pos = filter(non_seen_filter, selected_f_pos.tolist()) selected_f_pos = selected_f_pos[:-(self.number_of_features+1):-1] # Sort the features by IG def key_fun(i): return -1*self.classifier.feat_information_gain[i] selected_f_pos.sort(key=key_fun) return selected_f_pos # selected_f_pos = self.classifier.feat_information_gain.argsort()[:-100:-1] # coocurrence_with_class = [] # for feat_pos in selected_f_pos: # coocurrence_with_class.append( # self.classifier.feature_count_[class_number][feat_pos] # ) # coocurrence_with_class = np.array(coocurrence_with_class) # coocurrences_order = coocurrence_with_class.argsort() # res = [selected_f_pos[i] for i in coocurrences_order[::-1] # if self.user_features[class_number][selected_f_pos[i]] == # self.classifier.alpha] # return np.array(res[:self.number_of_features]) def evaluate_test(self): """Evaluates the classifier with the testing set. Returns: The score of the classifier over the test corpus """ return self.classifier.score(self.test_corpus.instances, self.test_corpus.primary_targets) def evaluate_training(self): """Evaluate the accuracy of the classifier with the labeled data. Returns: The score of the classifier over the training corpus """ # Agregamos la evidencia del usuario para evaluacion? return self.classifier.score(self.training_corpus.instances, self.training_corpus.primary_targets) def get_report(self): """ Returns: A sklearn.metrics.classification_report on the performance of the cassifier over the test corpus. """ predicted_targets = self.predict(self.test_corpus.instances) return classification_report(self.test_corpus.primary_targets, predicted_targets) def label_corpus(self): """Adds the user corpus to the unlabeled_corpus and saves it in a file. 
The filename must be passed into the configuration under the name u_corpus_f. """ if len(self.user_corpus): self.unlabeled_corpus.concetenate_corpus(self.user_corpus) self.unlabeled_corpus.save_to_file(self.u_corpus_f) def label_feature_corpus(self): """Adds user_features and asked_features in feature_corpus and saves it. The filename must be passed into the configuration under the name feature_corpus_f. """ self.feature_corpus = np.where(self.asked_features, np.zeros((self.n_class, self.n_feat)), self.feature_corpus) self.feature_corpus = np.where( self.user_features > self.alpha, np.ones((self.n_class, self.n_feat)), self.feature_corpus ) f = open(self.feature_corpus_f, 'w') pickle.dump(self.feature_corpus, f) f.close() def save_session(self, filename): """Saves the instances and targets introduced by the user in filename. Writes a pickle tuple in the file that can be recovered using the method load_session. Returns: False in case of error, True in case of success. """ if not filename: return False if not (len(self.user_corpus) != None or self.user_features != None): return False f = open(filename, 'w') to_save = {#'training_corpus': self.training_corpus, #'unlabeled_corpus': self.unlabeled_corpus, #'user_corpus': self.user_corpus, #'user_features': self.user_features, 'recorded_precision': self.recorded_precision, 'asked_features': (self.asked_features if hasattr(self, 'asked_features') else None), 'classification_report': self.get_report(), 'classes': self.classes } pickle.dump(to_save, f) f.close() return True def load_session(self): """Loads the instances and targets stored on filename. Overwrites the previous answers of the users. Args: filename: a string. The name of a file that has a pickle tuple. The first element of the tuple is a list of vectors, the second is a list of targets. Returns: False in case of error, True in case of success. 
""" if not self.session_filename: return False f = open(self.session_filename, 'r') loaded_data = pickle.load(f) f.close() self.training_corpus = loaded_data['training_corpus'] self.unlabeled_corpus = loaded_data['unlabeled_corpus'] self.user_corpus = loaded_data['user_corpus'] self.user_features = loaded_data['user_features'] self.recorded_precision = loaded_data['recorded_precision'] self.asked_features = loaded_data['asked_features'] return True
class TestCorpus(unittest.TestCase):
    """Unit tests for the Corpus container."""

    def setUp(self):
        # A two-instance corpus shared by every test.
        self.co = Corpus()
        self.co.instances = csr_matrix([[1, 2, 3], [4, 5, 6]])
        self.co.full_targets = [[1], [2, 3]]
        self.co.representations = ['representation1', 'representation2']
        self.co.calculate_primary_targets()
        self.co.add_extra_info('extra_info1')

    def tearDown(self):
        # Every test must leave the corpus internally consistent.
        self.assertTrue(self.co.check_consistency())

    def test_load_and_save(self):
        """Load and save functions must be inverses."""
        filename = 'testing_file'
        self.co.save_to_file(filename)
        new_co = Corpus()
        new_co.load_from_file(filename)
        self.assertTrue(_eq_crs_matrix(new_co.instances, self.co.instances))
        for index in range(len(self.co)):
            self.assertEqual(self.co.full_targets[index],
                             new_co.full_targets[index])
            self.assertEqual(self.co.representations[index],
                             new_co.representations[index])
        self.assertIsNotNone(new_co.primary_targets)

    def test_add_instance(self):
        self.co.add_instance([2, 3, 4], [2], 'representation3')
        self.assertEqual(len(self.co), 3)
        self.assertTrue(_eq_crs_matrix(csr_matrix([2, 3, 4]),
                                       self.co.instances[-1]))

    def test_add_extra_info(self):
        self.assertEqual(len(self.co.extra_info), 1)
        self.assertIn('extra_info1', self.co.extra_info)

    def test_add_extra_info_twice(self):
        self.co.add_extra_info('extra_info1', values=[1, 1])
        self.assertEqual(len(self.co.extra_info), 1)
        self.assertIn('extra_info1', self.co.extra_info)
        self.assertEqual(self.co.extra_info['extra_info1'], [1, 1])

    def test_add_extra_info_second(self):
        self.co.add_extra_info('extra_info2', values=[1, 1])
        self.assertEqual(len(self.co.extra_info), 2)
        self.assertIn('extra_info2', self.co.extra_info)
        self.assertEqual(self.co.extra_info['extra_info2'], [1, 1])

    def test_pop_first_instance(self):
        result = self.co.pop_instance(0)
        # check result
        self.assertEqual(len(result), 3)
        self.assertTrue(_eq_crs_matrix(csr_matrix([1, 2, 3]), result[0]))
        self.assertEqual(result[1], [1])
        self.assertEqual(result[2], 'representation1')
        # check corpus
        self.assertEqual(len(self.co), 1)
        self.assertTrue(_eq_crs_matrix(csr_matrix([4, 5, 6]),
                                       self.co.instances[0]))
        self.assertEqual(self.co.full_targets[0], [2, 3])
        self.assertEqual(self.co.representations[0], 'representation2')

    def test_pop_last_instance(self):
        result = self.co.pop_instance(1)
        # check result
        self.assertEqual(len(result), 3)
        self.assertTrue(_eq_crs_matrix(csr_matrix([4, 5, 6]), result[0]))
        self.assertEqual(result[1], [2, 3])
        self.assertEqual(result[2], 'representation2')
        # check corpus
        self.assertEqual(len(self.co), 1)
        self.assertTrue(_eq_crs_matrix(csr_matrix([1, 2, 3]),
                                       self.co.instances[0]))
        self.assertEqual(self.co.full_targets[0], [1])
        self.assertEqual(self.co.representations[0], 'representation1')

    def test_pop_middle_instance(self):
        self.co.add_instance([2, 3, 4], [2], 'representation3')
        result = self.co.pop_instance(1)
        self.assertEqual(len(self.co), 2)
        self.assertEqual(len(result), 3)
        self.assertTrue(_eq_crs_matrix(csr_matrix([4, 5, 6]), result[0]))
        self.assertEqual(result[1], [2, 3])
        self.assertEqual(result[2], 'representation2')

    def test_pop_all_instances(self):
        # BUG FIX: this test was also named test_pop_last_instance, which
        # silently shadowed the test above so it never ran.
        self.co.pop_instance(1)
        self.co.pop_instance(0)
        self.assertEqual(len(self.co), 0)

    def test_concatenate_corpus(self):
        new_corpus = Corpus()
        new_corpus.add_instance([2, 3, 4], [2], 'representation3')
        new_corpus.add_instance([10, 4, 4], [1, 1, 2], 'representation3')
        new_corpus.calculate_primary_targets()
        self.assertTrue(new_corpus.add_extra_info('extra_info1'))
        self.co.concetenate_corpus(new_corpus)
        self.assertEqual(len(self.co), 4)

    def test_concatenate_corpus_no_extra_info(self):
        new_corpus = Corpus()
        new_corpus.add_instance([2, 3, 4], [2], 'representation3')
        new_corpus.add_instance([10, 4, 4], [1, 1, 2], 'representation3')
        new_corpus.calculate_primary_targets()
        self.co.concetenate_corpus(new_corpus)
        self.assertEqual(len(self.co), 4)

    def test_concatenate_empty_corpus(self):
        new_corpus = Corpus()
        self.assertTrue(new_corpus.add_extra_info('extra_info1'))
        self.co.concetenate_corpus(new_corpus)
        self.assertEqual(len(self.co), 2)

    def test_calculate_primary_targets(self):
        self.assertEqual(self.co.primary_targets, [1, 2])

    def test_primary_targets_none(self):
        self.co.add_instance([0, 2, 3], [], 'r')
        self.co.calculate_primary_targets()
        self.assertEqual(self.co.primary_targets, [1, 2, None])

    def test_primary_targets_mode(self):
        self.co.add_instance([0, 2, 3], [1, 4, 4, 3, 3], 'r')
        self.co.calculate_primary_targets()
        self.assertEqual(self.co.primary_targets, [1, 2, 3])

    def test_len(self):
        self.assertEqual(len(self.co), 2)

    def test_len_empty(self):
        new_corpus = Corpus()
        self.assertEqual(len(new_corpus), 0)
def count_by_class(corpus): """Returns a dictionary with the number of instances by class""" result = defaultdict(lambda: 0) for target in co.primary_targets: result[target] += 1 return result c_by_class = count_by_class(co) for k, v in c_by_class.items(): print k, v limit = sorted(c_by_class.values())[-2] # limit = 10 to_remove = c_by_class['other'] - limit to_remove = {k: c_by_class[k] - limit for k in c_by_class} print to_remove, limit for i in range(len(co)-1, 0, -1): target = co.primary_targets[i] if to_remove[target] > 0: co.pop_instance(i) to_remove[target] -= 1 c_by_class = count_by_class(co) for k, v in c_by_class.items(): print k, v co.save_to_file('experimental/unlabeled_new_corpus_balanced2.pickle')