def checkInputProjectDatasetDir(self, logger):
    """Validate the on-disk layout of the experiment's input data.

    Checks, in order: the project directory, the dataset directory and
    the idents file, raising a dedicated exception for the first one
    missing. The ground-truth file is optional: when absent, a warning
    is logged and ``self.annotations_filename`` is set to None.
    """
    conf = self.secuml_conf
    # The project directory must exist.
    project_dir = dir_exp_tools.getProjectDirectory(conf, self.project)
    if not path.isdir(project_dir):
        raise ProjectDirNotFound(conf.input_data_dir, self.project)
    # The dataset directory must exist inside the project.
    dataset_dir = dir_exp_tools.getDatasetDirectory(conf, self.project,
                                                    self.dataset)
    if not path.isdir(dataset_dir):
        raise DatasetDirNotFound(conf.input_data_dir, self.project,
                                 self.dataset)
    # The idents file is mandatory.
    self.idents_filename = dir_exp_tools.getIdentsFilename(conf,
                                                           self.project,
                                                           self.dataset)
    if not path.isfile(self.idents_filename):
        raise IdentsFileNotFound(self.idents_filename)
    # Ground truth is optional: warn and carry on without it.
    self.annotations_filename = dir_exp_tools.getGroundTruthFilename(
        conf, self.project, self.dataset)
    if not dir_tools.checkFileExists(self.annotations_filename):
        logger.warning('No ground-truth available for the dataset %s/%s.'
                       % (self.project, self.dataset))
        self.annotations_filename = None
def checkWebLibraries():
    """Check that every required web library has been downloaded.

    For each 'js' and 'css' URL returned by getWebUrls(), the file with
    the URL's basename must exist under web/static/lib/<kind>/.
    Raises MissingWebLibraries on the first missing file.
    """
    root = path.join(SECUML_DIR, 'web', 'static', 'lib')
    urls = getWebUrls()
    for kind in ('js', 'css'):
        kind_dir = path.join(root, kind)
        for url in urls[kind]:
            expected = path.join(kind_dir, path.basename(urlparse(url).path))
            if not dir_tools.checkFileExists(expected):
                raise MissingWebLibraries()
def runNextIteration(experiment_id, iteration_number):
    """Launch the next active-learning iteration asynchronously.

    Returns the celery task id as a string. When user-experiment
    monitoring is enabled (user_exp), the action is appended to the
    experiment's user_actions.log.
    """
    res = str(celeryRunNextIteration.s().apply_async())
    if user_exp:
        experiment = updateCurrentExperiment(experiment_id)
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        entry = ','.join(map(str, [datetime.datetime.now(),
                                   'nextIteration', iteration_number]))
        # Mode 'a' creates the file when missing, so the previous
        # exists-check (racy between check and open) is unnecessary.
        # The trailing newline keeps one logged action per line; the
        # original write() ran successive records together.
        with open(filename, 'a') as f:
            f.write(entry + '\n')
    return res
def currentAnnotations(experiment_id, iteration):
    """Render the current-annotations page for an experiment.

    When user-experiment monitoring is enabled (user_exp), the display
    action is appended to the experiment's user_actions.log.
    Note: the `iteration` argument is accepted for the route signature
    but not used here.
    """
    experiment = updateCurrentExperiment(experiment_id)
    page = render_template('ActiveLearning/current_annotations.html',
                           project=experiment.project)
    if user_exp:
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        entry = ','.join(map(str, [datetime.datetime.now(),
                                   'displayAnnotatedInstances']))
        # Mode 'a' creates the file when missing (no exists-check
        # needed); trailing newline keeps one record per line.
        with open(filename, 'a') as f:
            f.write(entry + '\n')
    return page
def removeAnnotation(experiment_id, inst_experiment_id, iteration_number,
                     instance_id):
    """Delete the annotation of an instance.

    When user-experiment monitoring is enabled (user_exp), the removal
    is appended to the experiment's user_actions.log. Returns ''.
    Note: `iteration_number` is accepted for the route signature but
    not used here.
    """
    annotations_db_tools.removeAnnotation(session, inst_experiment_id,
                                          instance_id)
    if user_exp:
        experiment = updateCurrentExperiment(experiment_id)
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        entry = ','.join(map(str, [datetime.datetime.now(),
                                   'removeAnnotation', instance_id]))
        # Mode 'a' creates the file when missing (no exists-check
        # needed); trailing newline keeps one record per line.
        with open(filename, 'a') as f:
            f.write(entry + '\n')
    return ''
def mergeFamilies(experiment_id, label, families, new_family_name):
    """Merge several families (comma-separated string) into one.

    When user-experiment monitoring is enabled (user_exp), the merge is
    appended to the experiment's user_actions.log with the new family
    name followed by the merged family names. Returns ''.
    """
    families = families.split(',')
    annotations_db_tools.mergeFamilies(session, experiment_id, label,
                                       families, new_family_name)
    if user_exp:
        experiment = updateCurrentExperiment(experiment_id)
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        # Single map(str, ...) suffices; the original mapped the family
        # names (already strings) twice.
        fields = [datetime.datetime.now(), 'mergeFamilies',
                  new_family_name] + families
        # Mode 'a' creates the file when missing (no exists-check
        # needed); trailing newline keeps one record per line.
        with open(filename, 'a') as f:
            f.write(','.join(map(str, fields)) + '\n')
    return ''
def changeFamilyLabel(experiment_id, label, family):
    """Change the label associated with a whole family.

    When user-experiment monitoring is enabled (user_exp), the change
    is appended to the experiment's user_actions.log. Returns ''.
    """
    annotations_db_tools.changeFamilyLabel(session, experiment_id,
                                           label, family)
    if user_exp:
        experiment = updateCurrentExperiment(experiment_id)
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        entry = ','.join(map(str, [datetime.datetime.now(),
                                   'changeFamilyLabel', family, label]))
        # Mode 'a' creates the file when missing (no exists-check
        # needed); trailing newline keeps one record per line.
        with open(filename, 'a') as f:
            f.write(entry + '\n')
    return ''
def getFeaturesNamesDescriptions(self):
    """Return the features' names and descriptions as two lists.

    If a companion file <features>_description.csv exists, names and
    descriptions are read from its 'name' and 'description' columns.
    Otherwise both lists fall back to the header of the features CSV,
    skipping its first column (the instance identifier).
    """
    features_file = self.getFeaturesFullpath()
    root, _ = path.splitext(features_file)
    description_file = root + '_description.csv'
    if dir_tools.checkFileExists(description_file):
        # Dedicated description file: one row per feature.
        with open(description_file, 'r') as f:
            df = pd.read_csv(f, header=0, index_col=0)
        names = list(df['name'])
        descriptions = list(df['description'])
    else:
        # No description file: reuse the header names as descriptions.
        with open(features_file, 'r') as f:
            header = next(csv.reader(f))
        names = header[1:]
        descriptions = header[1:]
    return names, descriptions
def _setAnnotationsFilename(self, annotations_filename):
    """Register the experiment's annotations source in the database.

    The source is classified as 'none', 'ground_truth' or
    'partial_annotations'. For partial annotations, the CSV file is
    loaded into the database through the backend-specific loader
    (MySQL or PostgreSQL). Raises ValueError when the partial
    annotations file does not exist.
    """
    # Classify the annotations source.
    if annotations_filename is None:
        annotations_type = 'none'
    else:
        annotations_type = ('ground_truth'
                            if annotations_filename == 'ground_truth.csv'
                            else 'partial_annotations')
    record = db_tables.ExperimentAnnotationsAlchemy(
        experiment_id=self.experiment_id,
        annotations_type=annotations_type)
    self.session.add(record)
    # Flush so the auto-generated annotations_id is available.
    self.session.flush()
    self.annotations_id = record.annotations_id
    self.annotations_type = annotations_type
    if annotations_type != 'partial_annotations':
        return
    dataset_dir = dir_exp_tools.getDatasetDirectory(self.secuml_conf,
                                                    self.project,
                                                    self.dataset)
    filename = path.join(dataset_dir, 'annotations', annotations_filename)
    if not dir_tools.checkFileExists(filename):
        raise ValueError(
            'The annotation file %s does not exist.' % filename)
    families = dir_exp_tools.annotationsWithFamilies(filename)
    cursor = self.session.connection().connection.cursor()
    db_type = self.secuml_conf.db_type
    if db_type == 'mysql':
        mysql_specific.loadPartialAnnotations(cursor, filename, families,
                                              self.annotations_id,
                                              self.dataset_id)
    if db_type == 'postgresql':
        postgresql_specific.loadPartialAnnotations(cursor, filename,
                                                   families,
                                                   self.annotations_id,
                                                   self.dataset_id)
    self.session.flush()
def _setAnnotationsFilename(self, annotations_filename):
    """Register the experiment's annotations source and load it.

    The source is classified as 'none', 'ground_truth' or
    'partial_annotations'. For partial annotations, the CSV file is
    bulk-loaded into the `annotations` table through a temporary
    `labels_import` table, using backend-specific SQL (MySQL
    LOAD DATA LOCAL INFILE, or PostgreSQL COPY).

    Raises ValueError when the partial annotations file is missing.
    """
    if annotations_filename is None:
        annotations_type = 'none'
    elif annotations_filename == 'ground_truth.csv':
        annotations_type = 'ground_truth'
    else:
        annotations_type = 'partial_annotations'
    exp_annotations = db_tables.ExperimentAnnotationsAlchemy(
        experiment_id=self.experiment_id,
        annotations_type=annotations_type)
    self.session.add(exp_annotations)
    # Commit so the auto-generated annotations_id is populated.
    self.session.commit()
    self.annotations_id = exp_annotations.annotations_id
    self.annotations_type = annotations_type
    if annotations_type == 'partial_annotations':
        filename = path.join(
            dir_exp_tools.getDatasetDirectory(self.project, self.dataset),
            'annotations', annotations_filename)
        if not dir_tools.checkFileExists(filename):
            raise ValueError('The annotation file %s does not exist.'
                             % filename)
        # Check whether the file contains families: a 3-column header
        # means (user_instance_id, label, family), 2 columns means no
        # family information.
        families = False
        with open(filename, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)
            if len(header) == 3:
                families = True
        # Raw DB-API cursor: the bulk-load statements below are not
        # portable across backends, hence the two branches.
        cursor = self.session.connection().connection.cursor()
        if db_tools.isMysql():
            # Temporary staging table; instance_id is resolved after
            # the raw file has been loaded.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'annotations_id integer DEFAULT ' + \
                str(self.annotations_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label varchar(200), '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\''
            query += ');'
            cursor.execute(query)
            # Bulk-load the CSV (header skipped via IGNORE 1 LINES).
            query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
            query += 'INTO TABLE ' + 'labels_import' + ' '
            query += 'FIELDS TERMINATED BY \',\' '
            query += 'IGNORE 1 LINES '
            if families:
                query += '(user_instance_id, label, family) '
            else:
                query += '(user_instance_id, label) '
            query += ';'
            cursor.execute(query)
            # Map user-provided instance ids to internal instance ids.
            query = 'UPDATE labels_import l '
            query += 'JOIN instances i '
            query += 'ON i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ' '
            query += 'SET l.instance_id = i.id;'
            cursor.execute(query)
            # Move the staged rows into the final annotations table.
            query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
            query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
            query += 'FROM labels_import;'
            cursor.execute(query)
        elif db_tools.isPostgresql():
            # Same staging scheme; `label` uses the labels_enum type
            # declared elsewhere in the schema.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'annotations_id integer DEFAULT ' + \
                str(self.annotations_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label labels_enum, '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\''
            query += ');'
            cursor.execute(query)
            # COPY FROM STDIN streams the file through the client
            # connection (HEADER skips the first line).
            with open(filename, 'r') as f:
                if families:
                    query = 'COPY labels_import(user_instance_id,label,family) '
                else:
                    query = 'COPY labels_import(user_instance_id,label) '
                query += 'FROM STDIN '
                query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                cursor.copy_expert(sql=query, file=f)
            # Map user-provided instance ids to internal instance ids.
            query = 'UPDATE labels_import AS l '
            query += 'SET instance_id = i.id '
            query += 'FROM instances AS i '
            query += 'WHERE i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ';'
            cursor.execute(query)
            # Move the staged rows into the final annotations table.
            query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
            query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
            query += 'FROM labels_import;'
            cursor.execute(query)
        self.session.commit()
def loadGroundTruth(self, logger):
    """Load the dataset's ground-truth annotations into the database.

    Reads annotations/ground_truth.csv from the dataset directory and
    bulk-loads it into the `ground_truth` table through a temporary
    `ground_truth_import` table, using backend-specific SQL (MySQL
    LOAD DATA LOCAL INFILE, or PostgreSQL COPY). When the file does
    not exist, a warning is logged and the method returns without
    touching the database.
    """
    annotations_file = path.join(dir_exp_tools.getDatasetDirectory(
        self.project, self.dataset),
        'annotations', 'ground_truth.csv')
    if not dir_tools.checkFileExists(annotations_file):
        logger.warning('No ground-truth available for this dataset')
        return
    # Check whether the file contains families: a 3-column header
    # means (user_instance_id, label, family), 2 columns means no
    # family information.
    families = False
    with open(annotations_file, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        if len(header) == 3:
            families = True
    # Raw DB-API cursor: the bulk-load statements below are not
    # portable across backends, hence the two branches.
    cursor = self.session.connection().connection.cursor()
    if db_tools.isMysql():
        # Temporary staging table; `id` (internal instance id) is
        # resolved after the raw file has been loaded.
        query = 'CREATE TEMPORARY TABLE ground_truth_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label varchar(200), '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + \
            str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        # Bulk-load the CSV (header skipped via IGNORE 1 LINES).
        query = 'LOAD DATA LOCAL INFILE \'' + annotations_file + '\' '
        query += 'INTO TABLE ' + 'ground_truth_import' + ' '
        query += 'FIELDS TERMINATED BY \',\' '
        query += 'IGNORE 1 LINES '
        if families:
            query += '(user_instance_id, label, family) '
        else:
            query += '(user_instance_id, label) '
        query += ';'
        cursor.execute(query)
        # Map user-provided instance ids to internal instance ids.
        query = 'UPDATE ground_truth_import t '
        query += 'JOIN instances i '
        query += 'ON i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id '
        query += 'SET t.id = i.id;'
        cursor.execute(query)
        # Move the staged rows into the final ground_truth table.
        query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM ground_truth_import AS t;'
        cursor.execute(query)
    elif db_tools.isPostgresql():
        # Same staging scheme; `label` uses the ground_truth_enum type
        # declared elsewhere in the schema.
        query = 'CREATE TEMPORARY TABLE ground_truth_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label ground_truth_enum, '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + \
            str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        # COPY FROM STDIN streams the file through the client
        # connection (HEADER skips the first line).
        with open(annotations_file,
                  'r') as f:
            if families:
                query = 'COPY ground_truth_import(user_instance_id,label,family) '
            else:
                query = 'COPY ground_truth_import(user_instance_id,label) '
            query += 'FROM STDIN '
            query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
            cursor.copy_expert(sql=query, file=f)
        # Map user-provided instance ids to internal instance ids.
        query = 'UPDATE ground_truth_import AS t '
        query += 'SET id = i.id '
        query += 'FROM instances AS i '
        query += 'WHERE i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id;'
        cursor.execute(query)
        # Move the staged rows into the final ground_truth table.
        query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM ground_truth_import AS t;'
        cursor.execute(query)
    self.session.commit()