def checkInputProjectDatasetDir(self, logger):
    # Check project directory
    project_dir = dir_exp_tools.getProjectDirectory(self.secuml_conf,
                                                    self.project)
    if not path.isdir(project_dir):
        raise ProjectDirNotFound(self.secuml_conf.input_data_dir,
                                 self.project)
    # Check dataset directory
    dataset_dir = dir_exp_tools.getDatasetDirectory(self.secuml_conf,
                                                    self.project,
                                                    self.dataset)
    if not path.isdir(dataset_dir):
        raise DatasetDirNotFound(self.secuml_conf.input_data_dir,
                                 self.project,
                                 self.dataset)
    # Check idents file
    self.idents_filename = dir_exp_tools.getIdentsFilename(self.secuml_conf,
                                                           self.project,
                                                           self.dataset)
    if not path.isfile(self.idents_filename):
        raise IdentsFileNotFound(self.idents_filename)
    # Check ground-truth file
    self.annotations_filename = dir_exp_tools.getGroundTruthFilename(
            self.secuml_conf, self.project, self.dataset)
    if not dir_tools.checkFileExists(self.annotations_filename):
        logger.warning('No ground-truth available for the dataset %s/%s.'
                       % (self.project, self.dataset))
        self.annotations_filename = None
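# The checks above encode the expected layout of the input data directory.
# A sketch of that layout, as implied by this section's code (names in
# angle brackets are placeholders; the exact file names under features/
# and annotations/ are illustrative):
#
#   <input_data_dir>/
#       <project>/
#           <dataset>/
#               idents.csv
#               features/
#                   <features_filename>
#               annotations/
#                   ground_truth.csv          (optional)
#                   <partial_annotations>.csv (optional)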
def saveAnnotatedInstances(self):
    filename = 'annotations_%s_exp%d_it%d.csv' % (
            self.experiment.query_strategy,
            self.experiment.experiment_id,
            self.iteration_number)
    filename = path.join(
            dir_exp_tools.getDatasetDirectory(self.experiment.secuml_conf,
                                              self.experiment.project,
                                              self.experiment.dataset),
            'annotations',
            filename)
    self.datasets.saveAnnotatedInstances(filename, self.experiment)
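# For reference, the generated file name has the shape
# annotations_<query_strategy>_exp<experiment_id>_it<iteration_number>.csv,
# e.g. annotations_uncertainSampling_exp12_it3.csv (strategy name and
# numbers illustrative).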
def saveAnnotatedInstances(self):
    filename = 'annotations_'
    filename += self.experiment.query_strategy + '_'
    filename += 'exp' + str(self.experiment.experiment_id) + '_'
    filename += 'it' + str(self.iteration_number) + '.csv'
    filename = path.join(dir_exp_tools.getDatasetDirectory(
                                self.experiment.project,
                                self.experiment.dataset),
                         'annotations',
                         filename)
    self.datasets.saveAnnotatedInstances(filename)
def loadIdents(self):
    filename = path.join(dir_exp_tools.getDatasetDirectory(self.project,
                                                           self.dataset),
                         'idents.csv')
    cursor = self.session.connection().connection.cursor()
    if db_tools.isMysql():
        # MySQL: bulk-load the CSV straight into the instances table.
        query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
        query += 'INTO TABLE ' + 'instances' + ' '
        query += 'CHARACTER SET UTF8 '
        query += 'FIELDS TERMINATED BY \',\' '
        query += 'OPTIONALLY ENCLOSED BY \'"\' '
        query += 'IGNORE 1 LINES '
        query += 'SET dataset_id = ' + str(self.dataset_id) + ','
        query += 'row_number = NULL'
        query += ';'
        cursor.execute(query)
        # Number the rows of this dataset sequentially.
        query = 'SET @pos = 0;'
        cursor.execute(query)
        query = 'UPDATE instances SET row_number = '
        query += '( SELECT @pos := @pos + 1 ) WHERE dataset_id = ' + \
                str(self.dataset_id)
        query += ';'
        cursor.execute(query)
    elif db_tools.isPostgresql():
        # A third column in the header means the file carries timestamps.
        timestamps = False
        with open(filename, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)
            if len(header) == 3:
                timestamps = True
        # PostgreSQL: stage the CSV in a temporary table whose serial
        # column fills row_number, then move the rows into instances.
        query = 'CREATE TEMPORARY TABLE instances_import('
        query += 'user_instance_id integer, '
        query += 'ident varchar(200), '
        query += 'timestamp timestamp DEFAULT null,'
        query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ','
        query += 'row_number serial PRIMARY KEY'
        query += ');'
        cursor.execute(query)
        with open(filename, 'r') as f:
            if timestamps:
                query = 'COPY instances_import(user_instance_id,ident,timestamp) '
            else:
                query = 'COPY instances_import(user_instance_id,ident) '
            query += 'FROM STDIN '
            query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
            cursor.copy_expert(sql=query, file=f)
        query = 'INSERT INTO instances(user_instance_id,ident,timestamp,dataset_id,row_number) '
        query += 'SELECT user_instance_id, ident, timestamp, dataset_id, row_number '
        query += 'FROM instances_import;'
        cursor.execute(query)
    self.session.commit()
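# loadIdents treats idents.csv positionally: two columns mean
# (user_instance_id, ident), three mean an extra timestamp column, which
# the PostgreSQL branch detects from the header length. Both branches skip
# the header row ('IGNORE 1 LINES' / 'WITH CSV HEADER'), so the header
# names below are illustrative; only the column order and count matter:
#
#   user_instance_id,ident,timestamp
#   1,203.0.113.7,2016-01-01 00:00:00
#   2,198.51.100.2,2016-01-01 00:05:00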
def getFeaturesFullpath(self):
    dataset_dir = dir_exp_tools.getDatasetDirectory(self.project,
                                                    self.dataset)
    features_directory = path.join(dataset_dir, 'features')
    full_path = path.join(features_directory, self.features_filename)
    return full_path
def _setAnnotationsFilename(self, annotations_filename):
    if annotations_filename is None:
        annotations_type = 'none'
    elif annotations_filename == 'ground_truth.csv':
        annotations_type = 'ground_truth'
    else:
        annotations_type = 'partial_annotations'
    exp_annotations = db_tables.ExperimentAnnotationsAlchemy(
            experiment_id=self.experiment_id,
            annotations_type=annotations_type)
    self.session.add(exp_annotations)
    self.session.commit()
    self.annotations_id = exp_annotations.annotations_id
    self.annotations_type = annotations_type
    if annotations_type == 'partial_annotations':
        filename = path.join(
                dir_exp_tools.getDatasetDirectory(self.project,
                                                  self.dataset),
                'annotations',
                annotations_filename)
        if not dir_tools.checkFileExists(filename):
            raise ValueError('The annotation file %s does not exist.'
                             % filename)
        # Check whether the file contains families
        families = False
        with open(filename, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)
            if len(header) == 3:
                families = True
        cursor = self.session.connection().connection.cursor()
        if db_tools.isMysql():
            # Stage the CSV in a temporary import table.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'annotations_id integer DEFAULT ' + \
                    str(self.annotations_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label varchar(200), '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\''
            query += ');'
            cursor.execute(query)
            query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
            query += 'INTO TABLE ' + 'labels_import' + ' '
            query += 'FIELDS TERMINATED BY \',\' '
            query += 'IGNORE 1 LINES '
            if families:
                query += '(user_instance_id, label, family) '
            else:
                query += '(user_instance_id, label) '
            query += ';'
            cursor.execute(query)
            # Resolve user_instance_id to the internal instance id.
            query = 'UPDATE labels_import l '
            query += 'JOIN instances i '
            query += 'ON i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ' '
            query += 'SET l.instance_id = i.id;'
            cursor.execute(query)
            # Move the resolved rows into the annotations table.
            query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
            query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
            query += 'FROM labels_import;'
            cursor.execute(query)
        elif db_tools.isPostgresql():
            # Stage the CSV in a temporary import table.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'annotations_id integer DEFAULT ' + \
                    str(self.annotations_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label labels_enum, '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\''
            query += ');'
            cursor.execute(query)
            with open(filename, 'r') as f:
                if families:
                    query = 'COPY labels_import(user_instance_id,label,family) '
                else:
                    query = 'COPY labels_import(user_instance_id,label) '
                query += 'FROM STDIN '
                query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                cursor.copy_expert(sql=query, file=f)
            # Resolve user_instance_id to the internal instance id.
            query = 'UPDATE labels_import AS l '
            query += 'SET instance_id = i.id '
            query += 'FROM instances AS i '
            query += 'WHERE i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ';'
            cursor.execute(query)
            # Move the resolved rows into the annotations table.
            query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
            query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
            query += 'FROM labels_import;'
            cursor.execute(query)
    self.session.commit()
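# The annotation file parsed above is positional as well: two columns mean
# (user_instance_id, label), three mean an extra family column, detected
# from the header length. A minimal example with illustrative header names
# and values (on PostgreSQL the label values must belong to the
# labels_enum type):
#
#   user_instance_id,label,family
#   12,malicious,C&C
#   54,benign,other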
def loadGroundTruth(self, logger):
    annotations_file = path.join(
            dir_exp_tools.getDatasetDirectory(self.project, self.dataset),
            'annotations',
            'ground_truth.csv')
    if not dir_tools.checkFileExists(annotations_file):
        logger.warning('No ground-truth available for this dataset')
        return
    # Check whether the file contains families
    families = False
    with open(annotations_file, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        if len(header) == 3:
            families = True
    cursor = self.session.connection().connection.cursor()
    if db_tools.isMysql():
        # Stage the CSV in a temporary import table.
        query = 'CREATE TEMPORARY TABLE ground_truth_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label varchar(200), '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + \
                str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        query = 'LOAD DATA LOCAL INFILE \'' + annotations_file + '\' '
        query += 'INTO TABLE ' + 'ground_truth_import' + ' '
        query += 'FIELDS TERMINATED BY \',\' '
        query += 'IGNORE 1 LINES '
        if families:
            query += '(user_instance_id, label, family) '
        else:
            query += '(user_instance_id, label) '
        query += ';'
        cursor.execute(query)
        # Resolve user_instance_id to the internal instance id.
        query = 'UPDATE ground_truth_import t '
        query += 'JOIN instances i '
        query += 'ON i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id '
        query += 'SET t.id = i.id;'
        cursor.execute(query)
        # Move the resolved rows into the ground_truth table.
        query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM ground_truth_import AS t;'
        cursor.execute(query)
    elif db_tools.isPostgresql():
        # Stage the CSV in a temporary import table.
        query = 'CREATE TEMPORARY TABLE ground_truth_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label ground_truth_enum, '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + \
                str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        with open(annotations_file, 'r') as f:
            if families:
                query = 'COPY ground_truth_import(user_instance_id,label,family) '
            else:
                query = 'COPY ground_truth_import(user_instance_id,label) '
            query += 'FROM STDIN '
            query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
            cursor.copy_expert(sql=query, file=f)
        # Resolve user_instance_id to the internal instance id.
        query = 'UPDATE ground_truth_import AS t '
        query += 'SET id = i.id '
        query += 'FROM instances AS i '
        query += 'WHERE i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id;'
        cursor.execute(query)
        # Move the resolved rows into the ground_truth table.
        query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM ground_truth_import AS t;'
        cursor.execute(query)
    self.session.commit()
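# _setAnnotationsFilename and loadGroundTruth share the same bulk-load
# pattern: copy the CSV into a temporary import table, resolve the
# user-facing user_instance_id to the internal instances.id with a single
# UPDATE joined against instances (restricted to the current dataset_id),
# then INSERT the resolved rows into the destination table. Doing the id
# resolution in SQL avoids issuing one query per row.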