class RareCategoryDetectionAnnotationQueries(AnnotationQueries):
    """Annotation queries generated for rare category detection.

    Candidate instances are the rows of ``self.predictions`` selected by
    ``matrix_tools.extractRowsWithThresholds`` with ``proba_min`` and
    ``proba_max`` on the ``'predicted_proba'`` column.

    Two modes exist, chosen by ``familiesAnalysis``:

    * ``'families'``: a multiclass model is run on the annotated instances
      and the queries are delegated to a ``Categories`` object;
    * ``'individual'``: candidates are sampled uniformly at random.

    NOTE(review): ``self.predictions`` and ``self.annotation_queries`` are
    not initialised in this class; presumably the ``AnnotationQueries``
    base class (or the caller) provides them -- confirm.
    """

    def __init__(self, iteration, label, proba_min, proba_max,
                 multiclass_model = None, multiclass_exp = None):
        """Store the probability thresholds and the optional multiclass model.

        Args:
            iteration: active learning iteration, forwarded to the base class.
            label: label handled by these queries, forwarded to the base class.
            proba_min: lower threshold given to ``extractRowsWithThresholds``.
            proba_max: upper threshold given to ``extractRowsWithThresholds``.
            multiclass_model: already built multiclass model to reuse, or
                ``None`` to build one in ``buildMulticlassClassifier``.
            multiclass_exp: experiment returned as is by
                ``buildMulticlassClassifier`` when ``multiclass_model`` is
                provided.
        """
        AnnotationQueries.__init__(self, iteration, label)
        self.proba_min = proba_min
        self.proba_max = proba_max
        self.rare_category_detection_conf = self.iteration.experiment.conf.rare_category_detection_conf
        self.multiclass_model = multiclass_model
        self.multiclass_exp = multiclass_exp

    def run(self, already_queried = None):
        """Run the models, then generate, export and visualise the queries.

        Only the query generation step is timed
        (``self.generate_queries_time``, in seconds).
        """
        self.runModels(already_queried = already_queried)
        start_time = time.time()
        self.generateAnnotationQueries()
        self.generate_queries_time = time.time() - start_time
        self.exportAnnotationQueries()
        self.generateClusteringVisualization()

    def runModels(self, already_queried = None):
        """Select the candidate instances and, if possible, build categories.

        Sets ``predicted_ids``, ``annotated_instances``, ``families_analysis``,
        ``annotations_type``, ``categories`` and ``analysis_time``.

        Args:
            already_queried: optional iterable of instance ids to exclude
                from the candidates.
        """
        df = matrix_tools.extractRowsWithThresholds(self.predictions,
                                                    self.proba_min,
                                                    self.proba_max,
                                                    'predicted_proba')
        if already_queried is not None:
            # The set difference does not preserve the order of df.index.
            self.predicted_ids = list(set(df.index).difference(set(already_queried)))
        else:
            self.predicted_ids = list(df.index)
        datasets = self.iteration.datasets
        self.annotated_instances = datasets.getAnnotatedInstances(label = self.label)
        self.families_analysis = self.familiesAnalysis()
        if self.families_analysis:
            self.annotations_type = 'families'
            start_time = time.time()
            self.buildCategories()
            self.analysis_time = time.time() - start_time
            self.categories.setLikelihood(self.iteration.iteration_number)
        else:
            self.annotations_type = 'individual'
            self.categories = None
            self.analysis_time = 0

    def generateAnnotationQueries(self):
        """Generate the queries, through the categories or by random sampling.

        Without families analysis, at most ``num_annotations`` candidates are
        drawn with ``random.sample`` and stored in ``self.annotation_queries``.
        """
        num_annotations = self.rare_category_detection_conf.num_annotations
        if self.families_analysis:
            self.categories.generateAnnotationQueries(self.rare_category_detection_conf)
            return
        selected_instances = self.predicted_ids
        if len(selected_instances) > num_annotations:
            selected_instances = random.sample(selected_instances, num_annotations)
        self.annotation_queries = []
        for instance_id in selected_instances:
            predicted_proba = self.predictions.loc[instance_id]['predicted_proba']
            query = AnnotationQuery(instance_id, predicted_proba, self.label, None)
            self.annotation_queries.append(query)

    def exportAnnotationQueries(self):
        """Export the queries (base class export, or categories export)."""
        if not self.families_analysis:
            AnnotationQueries.exportAnnotationQueries(self)
        else:
            filename = self.iteration.output_directory
            filename += 'toannotate_' + self.label + '.json'
            self.categories.exportAnnotationQueries(filename)

    def annotateAuto(self):
        """Delegate automatic annotation to the base class or the categories."""
        if not self.families_analysis:
            AnnotationQueries.annotateAuto(self)
        else:
            self.categories.annotateAuto(self.iteration)

    def getManualAnnotations(self):
        """Delegate manual annotation retrieval to the base class or the categories."""
        if not self.families_analysis:
            AnnotationQueries.getManualAnnotations(self)
        else:
            self.categories.getManualAnnotations(self.iteration)

    def checkAnnotationQueriesAnswered(self):
        """Return the delegate's answer on whether the queries are answered."""
        if not self.families_analysis:
            return AnnotationQueries.checkAnnotationQueriesAnswered(self)
        else:
            return self.categories.checkAnnotationQueriesAnswered(self.iteration)

    #######################
    ### Private methods ###
    #######################

    def createMulticlassExperiment(self):
        """Create, export and return the multiclass classification experiment.

        The experiment is a child of the active learning experiment and is
        named ``AL<experiment_id>-Iter<iteration_number>-<label>-analysis``.
        """
        conf = self.rare_category_detection_conf.classification_conf
        exp = self.iteration.experiment
        name = '-'.join(['AL' + str(exp.experiment_id),
                         'Iter' + str(self.iteration.iteration_number),
                         self.label,
                         'analysis'])
        multiclass_exp = ClassificationExperiment(exp.project, exp.dataset, exp.session,
                                                  experiment_name = name,
                                                  labels_id = exp.labels_id,
                                                  parent = exp.experiment_id)
        multiclass_exp.setFeaturesFilenames(exp.features_filenames)
        multiclass_exp.setClassifierConf(conf)
        multiclass_exp.createExperiment()
        multiclass_exp.export()
        return multiclass_exp

    def _oneHotRows(self, families):
        """Return one 0/1 row per family, aligned on the model class labels."""
        class_labels = self.multiclass_model.class_labels
        return [np.array([int(family == s) for s in class_labels])
                for family in families]

    def buildCategories(self):
        """Build ``self.categories`` from the multiclass model output.

        Test instances keep the probabilities predicted by the model; the
        instances whose family is already known get a one-hot probability
        row on their family.
        """
        multiclass_exp = self.buildMulticlassClassifier()
        model = self.multiclass_model
        train = model.datasets.train_instances
        test = model.datasets.test_instances
        all_instances = copy.deepcopy(test)
        all_instances.union(train)
        if test.numInstances() > 0:
            predicted_families = model.testing_monitoring.getPredictedLabels()
            all_families = list(predicted_families) + train.families
            predicted_proba = model.testing_monitoring.getAllPredictedProba()
            one_hot_rows = self._oneHotRows(train.families)
            if one_hot_rows:
                # Single stack instead of one vstack (and one full copy of
                # the matrix) per training instance.
                predicted_proba = np.vstack([predicted_proba] + one_hot_rows)
        else:
            all_families = self.annotated_instances.families
            one_hot_rows = self._oneHotRows(all_families)
            if not one_hot_rows:
                predicted_proba = None
            elif len(one_hot_rows) == 1:
                # Kept 1-D for backward compatibility: a single row was
                # never stacked into a matrix.
                predicted_proba = one_hot_rows[0]
            else:
                predicted_proba = np.vstack(one_hot_rows)
        labels_values = list(model.class_labels)
        assigned_categories = [labels_values.index(x) for x in all_families]
        self.categories = Categories(multiclass_exp,
                                     all_instances,
                                     assigned_categories,
                                     predicted_proba,
                                     self.label,
                                     model.class_labels)

    def buildMulticlassClassifier(self):
        """Return the multiclass experiment, training the model if needed.

        When a model was provided to the constructor (or already built),
        ``self.multiclass_exp`` is returned unchanged. Otherwise a new
        experiment is created, and a model is trained on the annotated
        instances and tested on the candidate instances.
        """
        if self.multiclass_model is not None:
            return self.multiclass_exp
        multiclass_exp = self.createMulticlassExperiment()
        datasets = self.iteration.datasets
        predicted_instances = datasets.getInstancesFromIds(self.predicted_ids)
        multiclass_datasets = ClassifierDatasets(multiclass_exp.classification_conf)
        multiclass_datasets.train_instances = self.annotated_instances
        multiclass_datasets.test_instances = predicted_instances
        multiclass_datasets.setSampleWeights()
        self.multiclass_model = multiclass_exp.classification_conf.model_class(
                multiclass_exp.classification_conf,
                multiclass_datasets,
                cv_monitoring = True)
        self.multiclass_model.run(multiclass_exp.getOutputDirectory(), multiclass_exp)
        return multiclass_exp

    # A multi class supervised model is learned from the annotated instances if:
    #       - there are at least 2 families
    #       - the second most represented family has at least 2 instances
    def familiesAnalysis(self):
        """Return True when a multiclass model can be learned (see above)."""
        num_families = len(self.annotated_instances.getFamiliesValues())
        if num_families < 2:
            return False
        families_counts = self.annotated_instances.getFamiliesCount()
        # items() instead of the Python 2 only iteritems().
        counts = sorted((count for _, count in families_counts.items()),
                        reverse = True)
        return counts[1] >= 2

    def createClusteringExperiment(self):
        """Create, export and return the clustering experiment.

        The experiment is a child of the active learning experiment and is
        named ``AL<experiment_id>-Iter<iteration_number>-<label>-clustering``.
        Its configuration is built from the number of categories.
        """
        conf = ClusteringConfiguration(self.categories.numCategories())
        exp = self.iteration.experiment
        name = '-'.join(['AL' + str(exp.experiment_id),
                         'Iter' + str(self.iteration.iteration_number),
                         self.label,
                         'clustering'])
        clustering_exp = ClusteringExperiment(exp.project, exp.dataset, exp.session,
                                              conf,
                                              labels_id = exp.labels_id,
                                              experiment_name = name,
                                              parent = exp.experiment_id)
        clustering_exp.setFeaturesFilenames(exp.features_filenames)
        clustering_exp.createExperiment()
        clustering_exp.export()
        return clustering_exp

    def generateClusteringVisualization(self):
        """Generate the clustering of the categories, if any.

        ``self.clustering_exp`` is set to ``None`` without families analysis.
        """
        if self.families_analysis:
            self.clustering_exp = self.createClusteringExperiment()
            clustering = Clustering(self.categories.instances,
                                    self.categories.assigned_categories)
            clustering.generateClustering(self.clustering_exp.getOutputDirectory(),
                                          None, None)
        else:
            self.clustering_exp = None