Example #1
 def checkInputProjectDatasetDir(self, logger):
     # Check project directory
     project_dir = dir_exp_tools.getProjectDirectory(
         self.secuml_conf, self.project)
     if not path.isdir(project_dir):
         raise ProjectDirNotFound(self.secuml_conf.input_data_dir,
                                  self.project)
     # Check dataset directory
     dataset_dir = dir_exp_tools.getDatasetDirectory(
         self.secuml_conf, self.project, self.dataset)
     if not path.isdir(dataset_dir):
         raise DatasetDirNotFound(self.secuml_conf.input_data_dir,
                                  self.project, self.dataset)
     # Check idents file
     self.idents_filename = dir_exp_tools.getIdentsFilename(
         self.secuml_conf, self.project, self.dataset)
     if not path.isfile(self.idents_filename):
         raise IdentsFileNotFound(self.idents_filename)
     # Check ground-truth file
     self.annotations_filename = dir_exp_tools.getGroundTruthFilename(
         self.secuml_conf, self.project, self.dataset)
     if not dir_tools.checkFileExists(self.annotations_filename):
         logger.warning('No ground-truth available for the dataset %s/%s.' %
                        (self.project, self.dataset))
         self.annotations_filename = None
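
All of these snippets lean on the same dir_exp_tools path helpers. As a minimal sketch of the directory layout they appear to assume (the helper bodies below are hypothetical reconstructions, not the project's actual implementation):

    from os import path

    def getProjectDirectory(secuml_conf, project):
        # Hypothetical: each project lives under the configured
        # input_data_dir (the attribute the NotFound exceptions report).
        return path.join(secuml_conf.input_data_dir, project)

    def getDatasetDirectory(secuml_conf, project, dataset):
        # Hypothetical: a dataset is a subdirectory of its project.
        return path.join(getProjectDirectory(secuml_conf, project), dataset)

    def getIdentsFilename(secuml_conf, project, dataset):
        # Hypothetical: idents.csv sits at the dataset root, matching the
        # explicit join in Example #4.
        return path.join(getDatasetDirectory(secuml_conf, project, dataset),
                         'idents.csv')
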
Example #2
 def saveAnnotatedInstances(self):
     filename = 'annotations_%s_exp%d_it%d.csv' % (
         self.experiment.query_strategy, self.experiment.experiment_id,
         self.iteration_number)
     filename = path.join(
         dir_exp_tools.getDatasetDirectory(self.experiment.secuml_conf,
                                           self.experiment.project,
                                           self.experiment.dataset),
         'annotations', filename)
     self.datasets.saveAnnotatedInstances(filename, self.experiment)
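
With illustrative values for the three attributes, the format string resolves as follows (the strategy name is made up):

    # query_strategy='UncertaintySampling', experiment_id=3, iteration_number=2
    'annotations_%s_exp%d_it%d.csv' % ('UncertaintySampling', 3, 2)
    # -> 'annotations_UncertaintySampling_exp3_it2.csv'
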
Example #3
 def saveAnnotatedInstances(self):
     filename  = 'annotations_'
     filename += self.experiment.query_strategy + '_'
     filename += 'exp' + str(self.experiment.experiment_id) + '_'
     filename += 'it' + str(self.iteration_number) + '.csv'
     filename = path.join(dir_exp_tools.getDatasetDirectory(
                                 self.experiment.project,
                                 self.experiment.dataset),
                          'annotations',
                          filename)
     self.datasets.saveAnnotatedInstances(filename)
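
Example #3 is an older variant of Example #2: getDatasetDirectory is called without secuml_conf, and the incremental concatenation builds the same name that Example #2 produces with a single format string. A more compact equivalent, assuming the same attributes:

    filename = 'annotations_{}_exp{}_it{}.csv'.format(
        self.experiment.query_strategy,
        self.experiment.experiment_id,
        self.iteration_number)
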
Example #4
 def loadIdents(self):
     filename = path.join(dir_exp_tools.getDatasetDirectory(
                                    self.project,
                                    self.dataset),
                          'idents.csv')
     cursor = self.session.connection().connection.cursor()
     if db_tools.isMysql():
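         # MySQL path: bulk-load the CSV with LOAD DATA LOCAL INFILE,
         # then number the rows with the @pos session variable.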
         query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
         query += 'INTO TABLE instances '
         query += 'CHARACTER SET UTF8 '
         query += 'FIELDS TERMINATED BY \',\' '
         query += 'OPTIONALLY ENCLOSED BY \'"\' '
         query += 'IGNORE 1 LINES '
         query += 'SET dataset_id = ' + str(self.dataset_id) + ','
         query += 'row_number = NULL'
         query += ';'
         cursor.execute(query)
         query = 'SET @pos = 0;'
         cursor.execute(query)
         query = 'UPDATE instances SET row_number = '
         query += '( SELECT @pos := @pos + 1 ) WHERE dataset_id = ' + \
             str(self.dataset_id)
         query += ';'
         cursor.execute(query)
     elif db_tools.isPostgresql():
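         # PostgreSQL path: COPY the CSV into a temporary table whose
         # serial column assigns row_number, then move the rows into
         # instances.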
         timestamps = False
         with open(filename, 'r') as f:
             reader = csv.reader(f)
             header = next(reader)
             if len(header) == 3:
                 timestamps = True
         query = 'CREATE TEMPORARY TABLE instances_import('
         query += 'user_instance_id integer, '
         query += 'ident varchar(200), '
         query += 'timestamp timestamp DEFAULT null,'
         query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ','
         query += 'row_number serial PRIMARY KEY'
         query += ');'
         cursor.execute(query)
         with open(filename, 'r') as f:
             if timestamps:
                 query = 'COPY instances_import(user_instance_id,ident,timestamp) '
             else:
                 query = 'COPY instances_import(user_instance_id,ident) '
             query += 'FROM STDIN '
             query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
             cursor.copy_expert(sql=query, file=f)
         query = 'INSERT INTO instances(user_instance_id,ident,timestamp,dataset_id,row_number) '
         query += 'SELECT user_instance_id, ident, timestamp, dataset_id, row_number '
         query += 'FROM instances_import;'
         cursor.execute(query)
     self.session.commit()
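
Both branches above bulk-load the CSV with backend-specific statements (MySQL's LOAD DATA LOCAL INFILE versus PostgreSQL's COPY ... FROM STDIN). A slower but dialect-neutral sketch of the same import, using only the csv module and parameterized inserts; the table and column names are taken from the INSERT in the PostgreSQL branch, everything else is an assumption:

    import csv

    def load_idents_portable(cursor, filename, dataset_id):
        # Insert idents.csv row by row; row_number comes from the
        # enumeration instead of a MySQL session variable or a
        # PostgreSQL serial column.
        with open(filename, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)
            has_timestamps = len(header) == 3
            for row_number, row in enumerate(reader, start=1):
                timestamp = row[2] if has_timestamps else None
                # %s placeholders match the paramstyle of both
                # MySQLdb/pymysql and psycopg2.
                cursor.execute(
                    'INSERT INTO instances(user_instance_id, ident, '
                    'timestamp, dataset_id, row_number) '
                    'VALUES (%s, %s, %s, %s, %s)',
                    (int(row[0]), row[1], timestamp, dataset_id, row_number))
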
Example #5
 def getFeaturesFullpath(self):
     dataset_dir = dir_exp_tools.getDatasetDirectory(
         self.project, self.dataset)
     features_directory = path.join(dataset_dir, 'features')
     full_path = path.join(features_directory, self.features_filename)
     return full_path
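
A stand-alone illustration of the path this returns, with hypothetical values:

    from os import path

    dataset_dir = path.join('input_data', 'MyProject', 'MyDataset')
    full_path = path.join(dataset_dir, 'features', 'features.csv')
    # -> 'input_data/MyProject/MyDataset/features/features.csv'
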
Example #6
    def _setAnnotationsFilename(self, annotations_filename):
        if annotations_filename is None:
            annotations_type = 'none'
        elif annotations_filename == 'ground_truth.csv':
            annotations_type = 'ground_truth'
        else:
            annotations_type = 'partial_annotations'

        exp_annotations = db_tables.ExperimentAnnotationsAlchemy(
            experiment_id=self.experiment_id,
            annotations_type=annotations_type)
        self.session.add(exp_annotations)
        self.session.commit()
        self.annotations_id = exp_annotations.annotations_id
        self.annotations_type = annotations_type

        if annotations_type == 'partial_annotations':
            filename = path.join(
                dir_exp_tools.getDatasetDirectory(self.project, self.dataset),
                'annotations', annotations_filename)
            if not dir_tools.checkFileExists(filename):
                raise ValueError('The annotation file %s does not exist.' %
                                 filename)
            # Check whether the file contains families
            families = False
            with open(filename, 'r') as f:
                reader = csv.reader(f)
                header = next(reader)
                if len(header) == 3:
                    families = True
            cursor = self.session.connection().connection.cursor()

            if db_tools.isMysql():
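                # MySQL path: stage the CSV in a temporary table, map
                # user_instance_id to the internal instance id with a
                # JOIN, then copy the rows into annotations.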
                query = 'CREATE TEMPORARY TABLE labels_import('
                query += 'instance_id integer, '
                query += 'annotations_id integer DEFAULT ' + \
                    str(self.annotations_id) + ', '
                query += 'user_instance_id integer, '
                query += 'label varchar(200), '
                query += 'family varchar(200) DEFAULT \'other\', '
                query += 'iteration integer DEFAULT 0, '
                query += 'method varchar(200) DEFAULT \'init\''
                query += ');'
                cursor.execute(query)

                query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
                query += 'INTO TABLE labels_import '
                query += 'FIELDS TERMINATED BY \',\' '
                query += 'IGNORE 1 LINES '
                if families:
                    query += '(user_instance_id, label, family) '
                else:
                    query += '(user_instance_id, label) '
                query += ';'
                cursor.execute(query)

                query = 'UPDATE labels_import l '
                query += 'JOIN instances i '
                query += 'ON i.user_instance_id = l.user_instance_id '
                query += 'AND i.dataset_id = ' + str(self.dataset_id) + ' '
                query += 'SET l.instance_id = i.id;'
                cursor.execute(query)

                query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
                query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
                query += 'FROM labels_import;'
                cursor.execute(query)

            elif db_tools.isPostgresql():
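                # PostgreSQL path: same staging via COPY, with
                # UPDATE ... FROM for the instance-id mapping.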
                query = 'CREATE TEMPORARY TABLE labels_import('
                query += 'instance_id integer, '
                query += 'annotations_id integer DEFAULT ' + \
                    str(self.annotations_id) + ', '
                query += 'user_instance_id integer, '
                query += 'label labels_enum, '
                query += 'family varchar(200) DEFAULT \'other\', '
                query += 'iteration integer DEFAULT 0, '
                query += 'method varchar(200) DEFAULT \'init\''
                query += ');'
                cursor.execute(query)

                with open(filename, 'r') as f:
                    if families:
                        query = 'COPY labels_import(user_instance_id,label,family) '
                    else:
                        query = 'COPY labels_import(user_instance_id,label) '
                    query += 'FROM STDIN '
                    query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                    cursor.copy_expert(sql=query, file=f)

                query = 'UPDATE labels_import AS l '
                query += 'SET instance_id = i.id '
                query += 'FROM instances AS i '
                query += 'WHERE i.user_instance_id = l.user_instance_id '
                query += 'AND i.dataset_id = ' + str(self.dataset_id) + ';'
                cursor.execute(query)

                query = 'INSERT INTO annotations(instance_id,annotations_id,label,family,iteration,method) '
                query += 'SELECT instance_id,annotations_id,label,family,iteration,method '
                query += 'FROM labels_import;'
                cursor.execute(query)

            self.session.commit()
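
The len(header) == 3 test means the partial-annotations file carries either two or three columns. A minimal file this import would accept might look as follows; the label and family values are illustrative, and the PostgreSQL branch additionally requires the labels to fit the labels_enum type:

    user_instance_id,label,family
    1,malicious,trojan
    2,benign,other
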
Example #7
    def loadGroundTruth(self, logger):
        annotations_file = path.join(dir_exp_tools.getDatasetDirectory(
                                            self.project,
                                            self.dataset),
                                     'annotations',
                                     'ground_truth.csv')
        if not dir_tools.checkFileExists(annotations_file):
            logger.warning('No ground-truth available for this dataset')
            return

        # Check whether the file contains families
        families = False
        with open(annotations_file, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)
            if len(header) == 3:
                families = True
        cursor = self.session.connection().connection.cursor()

        if db_tools.isMysql():
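            # MySQL path: stage the CSV with LOAD DATA LOCAL INFILE, map
            # user_instance_id to the internal id, then fill ground_truth.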
            query = 'CREATE TEMPORARY TABLE ground_truth_import('
            query += 'user_instance_id integer PRIMARY KEY, '
            query += 'label varchar(200), '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'dataset_id integer DEFAULT ' + \
                str(self.dataset_id) + ', '
            query += 'id integer DEFAULT NULL'
            query += ');'
            cursor.execute(query)

            query = 'LOAD DATA LOCAL INFILE \'' + annotations_file + '\' '
            query += 'INTO TABLE ground_truth_import '
            query += 'FIELDS TERMINATED BY \',\' '
            query += 'IGNORE 1 LINES '
            if families:
                query += '(user_instance_id, label, family) '
            else:
                query += '(user_instance_id, label) '
            query += ';'
            cursor.execute(query)

            query = 'UPDATE ground_truth_import t '
            query += 'JOIN instances i '
            query += 'ON i.user_instance_id = t.user_instance_id '
            query += 'AND i.dataset_id = t.dataset_id '
            query += 'SET t.id = i.id;'
            cursor.execute(query)

            query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
            query += 'SELECT t.id, t.dataset_id, t.label, t.family '
            query += 'FROM ground_truth_import AS t;'
            cursor.execute(query)

        elif db_tools.isPostgresql():
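            # PostgreSQL path: same staging via COPY, with UPDATE ... FROM
            # for the id mapping.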
            query = 'CREATE TEMPORARY TABLE ground_truth_import('
            query += 'user_instance_id integer PRIMARY KEY, '
            query += 'label ground_truth_enum, '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'dataset_id integer DEFAULT ' + \
                str(self.dataset_id) + ', '
            query += 'id integer DEFAULT NULL'
            query += ');'
            cursor.execute(query)

            with open(annotations_file, 'r') as f:
                if families:
                    query = 'COPY ground_truth_import(user_instance_id,label,family) '
                else:
                    query = 'COPY ground_truth_import(user_instance_id,label) '
                query += 'FROM STDIN '
                query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                cursor.copy_expert(sql=query, file=f)

            query = 'UPDATE ground_truth_import AS t '
            query += 'SET id = i.id '
            query += 'FROM instances AS i '
            query += 'WHERE i.user_instance_id = t.user_instance_id '
            query += 'AND i.dataset_id = t.dataset_id;'
            cursor.execute(query)

            query = 'INSERT INTO ground_truth(instance_id, dataset_id, label, family) '
            query += 'SELECT t.id, t.dataset_id, t.label, t.family '
            query += 'FROM ground_truth_import AS t;'
            cursor.execute(query)

        self.session.commit()
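
A quick sanity check after the import, assuming the same cursor and the ground_truth columns used in the INSERT above:

    cursor.execute('SELECT count(*) FROM ground_truth WHERE dataset_id = %s',
                   (self.dataset_id,))
    print('%d ground-truth rows imported' % cursor.fetchone()[0])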