Example #1
 def initFromFile(self, labels_filename):
     filename = dir_tools.getDatasetDirectory(self.project, self.dataset)
     filename += 'labels/' + labels_filename
     if not dir_tools.checkFileExists(filename):
         raise ValueError('The labels file %s does not exist.' % filename)
     ## Check whether the file contains families
     families = False
     with open(filename, 'r') as f:
         header = f.readline()
         fields = header.split(',')
         if len(fields) == 3:
             families = True
     query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
     query += 'INTO TABLE ' + 'Labels' + ' '
     query += 'FIELDS TERMINATED BY \',\' '
     query += 'IGNORE 1 LINES '
     if families:
         query += '(instance_id, label, family) '
     else:
         query += '(instance_id, label) '
     query += 'SET experiment_label_id = ' + str(
         self.experiment_label_id) + ', '
     if not families:
         query += 'family = "other",'
     query += 'iteration = 0, '
     query += 'method = "init", '
     query += 'annotation = "0"'
     query += ';'
     self.cursor.execute(query)
     self.db.commit()
     self.checkLabelsValidity()
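
The header check above distinguishes two CSV layouts: a three-field header means the file also carries a family column, a two-field header means labels only. Below is a minimal sketch (not part of SecuML) of the two shapes the LOAD DATA column lists expect; the header names mirror those column lists, and the label/family values are placeholders.

    import csv

    def write_sample_labels(path, with_families=True):
        # Writes a labels file in the shape initFromFile() accepts:
        # a header line followed by (instance_id, label[, family]) rows.
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f)
            if with_families:
                writer.writerow(['instance_id', 'label', 'family'])
                writer.writerow([1, 'malicious', 'some_family'])  # placeholder values
            else:
                writer.writerow(['instance_id', 'label'])
                writer.writerow([1, 'malicious'])  # placeholder values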
Example #2
 def getFeaturesFilesFullpaths(self):
     features_filenames = self.getFeaturesFilenames()
     features_directory = dir_tools.getDatasetDirectory(
         self.project, self.dataset) + 'features/'
     features_filenames = [
         features_directory + f for f in features_filenames
     ]
     return features_filenames
Example #3
 def saveLabeledInstances(self):
     for i in ['annotations', 'labels']:
        filename = dir_tools.getDatasetDirectory(
                 self.experiment.project,
                 self.experiment.dataset)
         filename += 'labels/' + i + '_'
         filename += self.experiment.labeling_method + '_'
         filename += 'exp' + str(self.experiment.experiment_id) + '_'
         filename += 'it' + str(self.iteration_number) + '.csv'
         self.datasets.saveLabeledInstances(i, filename)
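
The concatenation above produces paths of the form labels/<kind>_<labeling_method>_exp<experiment_id>_it<iteration>.csv under the dataset directory. A small sketch (illustrative helper, not SecuML API) that reproduces the same naming scheme:

    def labeled_instances_path(dataset_dir, kind, labeling_method,
                               experiment_id, iteration_number):
        # Same pattern as the concatenation in saveLabeledInstances().
        return (f'{dataset_dir}labels/{kind}_{labeling_method}_'
                f'exp{experiment_id}_it{iteration_number}.csv')

    # labeled_instances_path('/data/proj/dataset/', 'annotations', 'ILAB', 3, 2)
    # -> '/data/proj/dataset/labels/annotations_ILAB_exp3_it2.csv'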
Example #4
 def loadIdents(self):
     idents_file = dir_tools.getDatasetDirectory(
         self.project, self.dataset) + 'idents.csv'
     fields = ['instance_id', 'ident', 'row_number']
     types = [
         'INT', 'VARCHAR(200) CHARACTER SET utf8',
         'INT NOT NULL AUTO_INCREMENT'
     ]
     mysql_tools.createTableFromFields(self.cursor, 'Idents', fields, types,
                                       ['row_number', 'instance_id'])
     mysql_tools.loadCsvFile(self.cursor, idents_file, 'Idents',
                             ['row_number'])
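
createTableFromFields() receives the column names, their MySQL types, and, presumably, the key columns as its last argument. A hedged guess at the DDL this corresponds to; the real statement is built inside mysql_tools, which is not shown here and may differ.

    # Hedged reconstruction only; mysql_tools.createTableFromFields() may
    # generate a different statement.
    create_idents = (
        'CREATE TABLE Idents('
        'instance_id INT, '
        'ident VARCHAR(200) CHARACTER SET utf8, '
        'row_number INT NOT NULL AUTO_INCREMENT, '
        'PRIMARY KEY(row_number, instance_id)'
        ');')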
Example #5
File: Datasets.py  Project: r3dlight/SecuML
 def saveLabeledInstances(self, iteration_number):
     for i in ['annotations', 'labels']:
         filename = dir_tools.getDatasetDirectory(self.experiment.project,
                                                  self.experiment.dataset)
         filename += 'labels/' + i + '_'
         filename += self.experiment.labeling_method + '_'
         filename += 'exp' + str(self.experiment.experiment_id) + '_'
         filename += 'it' + str(iteration_number) + '.csv'
         if i == 'annotations':
             instances = self.instances.getAnnotatedInstances()
         elif i == 'labels':
             instances = self.instances.getLabeledInstances()
         instances.saveInstancesLabels(filename)
Example #6
 def loadTrueLabels(self):
     labels_file = dir_tools.getDatasetDirectory(self.project, self.dataset)
     labels_file += 'labels/true_labels.csv'
     # Loads the true labels in the table TrueLabels if the file exists
     # Otherwise the table TrueLabels is not created
     if not dir_tools.checkFileExists(labels_file):
         print('No ground truth labels for this dataset', file=sys.stderr)
         return
     exp = Experiment(self.project,
                      self.dataset,
                      self.db,
                      self.cursor,
                      experiment_name='true_labels')
     exp.initLabels('true_labels.csv')
Example #7
 def loadIdents(self):
     filename = dir_tools.getDatasetDirectory(self.project, self.dataset)
     filename += 'idents.csv'
     db, cursor = db_tools.getRawConnection()
     if db_tools.isMysql():
         query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
         query += 'INTO TABLE ' + 'instances' + ' '
         query += 'CHARACTER SET UTF8 '
         query += 'FIELDS TERMINATED BY \',\' '
         query += 'OPTIONALLY ENCLOSED BY \'"\' '
         query += 'IGNORE 1 LINES '
         query += 'SET dataset_id = ' + str(self.dataset_id) + ','
         query += 'row_number = NULL'
         query += ';'
         cursor.execute(query)
         query = 'SET @pos = 0;'
         cursor.execute(query)
         query = 'UPDATE instances SET row_number = '
         query += '( SELECT @pos := @pos + 1 ) WHERE dataset_id = ' + str(
             self.dataset_id)
         query += ';'
         cursor.execute(query)
     elif db_tools.isPostgresql():
         query = 'CREATE TEMPORARY TABLE instances_import('
         query += 'user_instance_id integer, '
         query += 'ident varchar(200), '
         query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ','
         query += 'row_number serial PRIMARY KEY'
         query += ');'
         cursor.execute(query)
         with open(filename, 'r') as f:
             query = 'COPY instances_import(user_instance_id,ident) '
             query += 'FROM STDIN '
             query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
             cursor.copy_expert(sql=query, file=f)
         query = 'INSERT INTO instances(user_instance_id,ident,dataset_id,row_number) '
         query += 'SELECT user_instance_id, ident, dataset_id, row_number '
         query += 'FROM instances_import;'
         cursor.execute(query)
     db_tools.closeRawConnection(db, cursor)
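
Both branches expect the same input: idents.csv starts with a header row (skipped by IGNORE 1 LINES on MySQL and WITH CSV HEADER on PostgreSQL) followed by two columns that land in user_instance_id and ident. A small, hedged sanity check for that shape (the helper is illustrative, not part of SecuML):

    import csv

    def check_idents_file(path):
        # Verifies the two-column, header-first layout that loadIdents()
        # feeds to LOAD DATA LOCAL INFILE or COPY ... FROM STDIN.
        with open(path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)  # header row; both loaders skip it
            for line_number, row in enumerate(reader, start=2):
                if len(row) != 2:
                    raise ValueError('idents.csv line %d: expected 2 fields, got %d'
                                     % (line_number, len(row)))
        return header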
Example #8
    def loadTrueLabels(self):
        labels_file = dir_tools.getDatasetDirectory(self.project, self.dataset)
        labels_file += 'labels/true_labels.csv'
        # Loads the true labels in the table TrueLabels if the file exists
        # Otherwise the table TrueLabels is not created
        if not dir_tools.checkFileExists(labels_file):
            print('No ground truth labels for this dataset', file=sys.stderr)
            return

        ## Check whether the file contains families
        families = False
        with open(labels_file, 'r') as f:
            header = f.readline()
            fields = header.split(',')
            if len(fields) == 3:
                families = True
        db, cursor = db_tools.getRawConnection()

        if db_tools.isMysql():
            query = 'CREATE TEMPORARY TABLE true_labels_import('
            query += 'user_instance_id integer PRIMARY KEY, '
            query += 'label varchar(200), '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'dataset_id integer DEFAULT ' + str(
                self.dataset_id) + ', '
            query += 'id integer DEFAULT NULL'
            query += ');'
            cursor.execute(query)

            query = 'LOAD DATA LOCAL INFILE \'' + labels_file + '\' '
            query += 'INTO TABLE ' + 'true_labels_import' + ' '
            query += 'FIELDS TERMINATED BY \',\' '
            query += 'IGNORE 1 LINES '
            if families:
                query += '(user_instance_id, label, family) '
            else:
                query += '(user_instance_id, label) '
            query += ';'
            cursor.execute(query)

            query = 'UPDATE true_labels_import t '
            query += 'JOIN instances i '
            query += 'ON i.user_instance_id = t.user_instance_id '
            query += 'AND i.dataset_id = t.dataset_id '
            query += 'SET t.id = i.id;'
            cursor.execute(query)

            query = 'INSERT INTO true_labels(instance_id, dataset_id, label, family) '
            query += 'SELECT t.id, t.dataset_id, t.label, t.family '
            query += 'FROM true_labels_import AS t;'
            cursor.execute(query)

        elif db_tools.isPostgresql():
            query = 'CREATE TEMPORARY TABLE true_labels_import('
            query += 'user_instance_id integer PRIMARY KEY, '
            query += 'label true_labels_enum, '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'dataset_id integer DEFAULT ' + str(
                self.dataset_id) + ', '
            query += 'id integer DEFAULT NULL'
            query += ');'
            cursor.execute(query)

            with open(labels_file, 'r') as f:
                if families:
                    query = 'COPY true_labels_import(user_instance_id,label,family) '
                else:
                    query = 'COPY true_labels_import(user_instance_id,label) '
                query += 'FROM STDIN '
                query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                cursor.copy_expert(sql=query, file=f)

            query = 'UPDATE true_labels_import AS t '
            query += 'SET id = i.id '
            query += 'FROM instances AS i '
            query += 'WHERE i.user_instance_id = t.user_instance_id '
            query += 'AND i.dataset_id = t.dataset_id;'
            cursor.execute(query)

            query = 'INSERT INTO true_labels(instance_id, dataset_id, label, family) '
            query += 'SELECT t.id, t.dataset_id, t.label, t.family '
            query += 'FROM true_labels_import AS t;'
            cursor.execute(query)

        db_tools.closeRawConnection(db, cursor)
Example #9
    def initFromFile(self, labels_filename):
        if labels_filename is None:
            labels_type = 'none'
        elif labels_filename == 'true_labels.csv':
            labels_type = 'true_labels'
        else:
            labels_type = 'partial_labels'

        exp_labels = db_tables.ExperimentsLabelsAlchemy(
            experiment_id=self.experiment_id, labels_type=labels_type)
        self.session.add(exp_labels)
        self.session.commit()
        self.labels_id = exp_labels.labels_id
        self.labels_type = labels_type

        if labels_type == 'partial_labels':
            filename = dir_tools.getDatasetDirectory(self.project,
                                                     self.dataset)
            filename += 'labels/' + labels_filename
            if not dir_tools.checkFileExists(filename):
                raise ValueError('The labels file %s does not exist.' %
                                 filename)
            ## Check whether the file contains families
            families = False
            with open(filename, 'r') as f:
                header = f.readline()
                fields = header.split(',')
                if len(fields) == 3:
                    families = True
            db, cursor = db_tools.getRawConnection()

            if db_tools.isMysql():
                query = 'CREATE TEMPORARY TABLE labels_import('
                query += 'instance_id integer, '
                query += 'labels_id integer DEFAULT ' + str(
                    self.labels_id) + ', '
                query += 'user_instance_id integer, '
                query += 'label varchar(200), '
                query += 'family varchar(200) DEFAULT \'other\', '
                query += 'iteration integer DEFAULT 0, '
                query += 'method varchar(200) DEFAULT \'init\', '
                query += 'annotation boolean DEFAULT True'
                query += ');'
                cursor.execute(query)

                query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
                query += 'INTO TABLE ' + 'labels_import' + ' '
                query += 'FIELDS TERMINATED BY \',\' '
                query += 'IGNORE 1 LINES '
                if families:
                    query += '(user_instance_id, label, family) '
                else:
                    query += '(user_instance_id, label) '
                query += ';'
                cursor.execute(query)

                query = 'UPDATE labels_import l '
                query += 'JOIN instances i '
                query += 'ON i.user_instance_id = l.user_instance_id '
                query += 'AND i.dataset_id = ' + str(self.dataset_id) + ' '
                query += 'SET l.instance_id = i.id;'
                cursor.execute(query)

                query = 'INSERT INTO labels(instance_id,labels_id,label,family,iteration,method,annotation) '
                query += 'SELECT instance_id,labels_id,label,family,iteration,method,annotation '
                query += 'FROM labels_import;'
                cursor.execute(query)

            elif db_tools.isPostgresql():
                query = 'CREATE TEMPORARY TABLE labels_import('
                query += 'instance_id integer, '
                query += 'labels_id integer DEFAULT ' + str(
                    self.labels_id) + ', '
                query += 'user_instance_id integer, '
                query += 'label labels_enum, '
                query += 'family varchar(200) DEFAULT \'other\', '
                query += 'iteration integer DEFAULT 0, '
                query += 'method varchar(200) DEFAULT \'init\', '
                query += 'annotation boolean DEFAULT True'
                query += ');'
                cursor.execute(query)

                with open(filename, 'r') as f:
                    if families:
                        query = 'COPY labels_import(user_instance_id,label,family) '
                    else:
                        query = 'COPY labels_import(user_instance_id,label) '
                    query += 'FROM STDIN '
                    query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                    cursor.copy_expert(sql=query, file=f)

                query = 'UPDATE labels_import AS l '
                query += 'SET instance_id = i.id '
                query += 'FROM instances AS i '
                query += 'WHERE i.user_instance_id = l.user_instance_id '
                query += 'AND i.dataset_id = ' + str(self.dataset_id) + ';'
                cursor.execute(query)

                query = 'INSERT INTO labels(instance_id,labels_id,label,family,iteration,method,annotation) '
                query += 'SELECT instance_id,labels_id,label,family,iteration,method,annotation '
                query += 'FROM labels_import;'
                cursor.execute(query)

            db_tools.closeRawConnection(db, cursor)
            self.session.commit()
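
The first branch above makes the labels file optional and derives a labels_type from its name. A tiny standalone sketch of that mapping (the function name and the partial-labels file name are illustrative):

    def labels_type_from_filename(labels_filename):
        # Mirrors the branch at the top of initFromFile().
        if labels_filename is None:
            return 'none'
        if labels_filename == 'true_labels.csv':
            return 'true_labels'
        return 'partial_labels'

    # labels_type_from_filename(None)               -> 'none'
    # labels_type_from_filename('true_labels.csv')  -> 'true_labels'
    # labels_type_from_filename('my_labels.csv')    -> 'partial_labels'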