def initFromFile(self, labels_filename):
    filename = dir_tools.getDatasetDirectory(self.project, self.dataset)
    filename += 'labels/' + labels_filename
    if not dir_tools.checkFileExists(filename):
        raise ValueError('The labels file %s does not exist.' % filename)
    ## Check whether the file contains families
    families = False
    with open(filename, 'r') as f:
        header = f.readline()
        fields = header.split(',')
        if len(fields) == 3:
            families = True
    query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
    query += 'INTO TABLE ' + 'Labels' + ' '
    query += 'FIELDS TERMINATED BY \',\' '
    query += 'IGNORE 1 LINES '
    if families:
        query += '(instance_id, label, family) '
    else:
        query += '(instance_id, label) '
    query += 'SET experiment_label_id = ' + str(self.experiment_label_id) + ', '
    if not families:
        query += 'family = "other",'
    query += 'iteration = 0, '
    query += 'method = "init", '
    query += 'annotation = "0"'
    query += ';'
    self.cursor.execute(query)
    self.db.commit()
    self.checkLabelsValidity()
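# --- Hedged example (not part of the original code) ---
# initFromFile() above expects a CSV with a header line and either two columns
# (instance_id, label) or three columns (instance_id, label, family); the
# family column is detected solely from the number of header fields. The
# sketch below writes such a file; the path and label values are illustrative.
def writeExamplePartialLabels(path):
    with open(path, 'w') as f:
        f.write('instance_id,label,family\n')  # 3 header fields => families detected
        f.write('1,malicious,trojan\n')
        f.write('2,benign,other\n')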
def getFeaturesFilesFullpaths(self):
    features_filenames = self.getFeaturesFilenames()
    features_directory = dir_tools.getDatasetDirectory(
        self.project, self.dataset) + 'features/'
    features_filenames = [
        features_directory + f for f in features_filenames
    ]
    return features_filenames
def saveLabeledInstances(self):
    for i in ['annotations', 'labels']:
        filename = dir_tools.getDatasetDirectory(
            self.experiment.project, self.experiment.dataset)
        filename += 'labels/' + i + '_'
        filename += self.experiment.labeling_method + '_'
        filename += 'exp' + str(self.experiment.experiment_id) + '_'
        filename += 'it' + str(self.iteration_number) + '.csv'
        self.datasets.saveLabeledInstances(i, filename)
def loadIdents(self):
    idents_file = dir_tools.getDatasetDirectory(
        self.project, self.dataset) + 'idents.csv'
    fields = ['instance_id', 'ident', 'row_number']
    types = [
        'INT', 'VARCHAR(200) CHARACTER SET utf8',
        'INT NOT NULL AUTO_INCREMENT'
    ]
    mysql_tools.createTableFromFields(self.cursor, 'Idents', fields, types,
                                      ['row_number', 'instance_id'])
    mysql_tools.loadCsvFile(self.cursor, idents_file, 'Idents',
                            ['row_number'])
def saveLabeledInstances(self, iteration_number):
    for i in ['annotations', 'labels']:
        filename = dir_tools.getDatasetDirectory(self.experiment.project,
                                                 self.experiment.dataset)
        filename += 'labels/' + i + '_'
        filename += self.experiment.labeling_method + '_'
        filename += 'exp' + str(self.experiment.experiment_id) + '_'
        filename += 'it' + str(iteration_number) + '.csv'
        if i == 'annotations':
            instances = self.instances.getAnnotatedInstances()
        elif i == 'labels':
            instances = self.instances.getLabeledInstances()
        instances.saveInstancesLabels(filename)
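# --- Hedged sketch (not part of the original code) ---
# Both saveLabeledInstances() variants above build output paths of the form
# <dataset_dir>labels/<kind>_<labeling_method>_exp<experiment_id>_it<iteration>.csv.
# The helper below only reproduces that naming scheme for illustration;
# 'dataset_dir' stands in for dir_tools.getDatasetDirectory(...) and the
# example values are made up.
def labeledInstancesFilename(dataset_dir, kind, labeling_method,
                             experiment_id, iteration_number):
    return (dataset_dir + 'labels/' + kind + '_' + labeling_method +
            '_exp' + str(experiment_id) + '_it' + str(iteration_number) +
            '.csv')

assert labeledInstancesFilename('/data/proj/dataset/', 'annotations',
                                'ILAB', 3, 7) == \
    '/data/proj/dataset/labels/annotations_ILAB_exp3_it7.csv'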
def loadTrueLabels(self):
    labels_file = dir_tools.getDatasetDirectory(self.project, self.dataset)
    labels_file += 'labels/true_labels.csv'
    # Loads the true labels in the table TrueLabels if the file exists
    # Otherwise the table TrueLabels is not created
    if not dir_tools.checkFileExists(labels_file):
        print >> sys.stderr, 'No ground truth labels for this dataset'
        return
    exp = Experiment(self.project, self.dataset, self.db, self.cursor,
                     experiment_name='true_labels')
    exp.initLabels('true_labels.csv')
def loadIdents(self):
    filename = dir_tools.getDatasetDirectory(self.project, self.dataset)
    filename += 'idents.csv'
    db, cursor = db_tools.getRawConnection()
    if db_tools.isMysql():
        # MySQL: bulk load the CSV with LOAD DATA, then number the rows
        # belonging to this dataset.
        query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
        query += 'INTO TABLE ' + 'instances' + ' '
        query += 'CHARACTER SET UTF8 '
        query += 'FIELDS TERMINATED BY \',\' '
        query += 'OPTIONALLY ENCLOSED BY \'"\' '
        query += 'IGNORE 1 LINES '
        query += 'SET dataset_id = ' + str(self.dataset_id) + ','
        query += 'row_number = NULL'
        query += ';'
        cursor.execute(query)
        query = 'SET @pos = 0;'
        cursor.execute(query)
        query = 'UPDATE instances SET row_number = '
        query += '( SELECT @pos := @pos + 1 ) WHERE dataset_id = ' + str(
            self.dataset_id)
        query += ';'
        cursor.execute(query)
    elif db_tools.isPostgresql():
        # PostgreSQL: stage the CSV in a temporary table with COPY, then
        # move the rows into the instances table.
        query = 'CREATE TEMPORARY TABLE instances_import('
        query += 'user_instance_id integer, '
        query += 'ident varchar(200), '
        query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ','
        query += 'row_number serial PRIMARY KEY'
        query += ');'
        cursor.execute(query)
        with open(filename, 'r') as f:
            query = 'COPY instances_import(user_instance_id,ident) '
            query += 'FROM STDIN '
            query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
            cursor.copy_expert(sql=query, file=f)
        query = 'INSERT INTO instances(user_instance_id,ident,dataset_id,row_number) '
        query += 'SELECT user_instance_id, ident, dataset_id, row_number '
        query += 'FROM instances_import;'
        cursor.execute(query)
    db_tools.closeRawConnection(db, cursor)
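# --- Hedged sketch (not part of the original code) ---
# Both branches of the loadIdents() variant above skip a header line and read
# two columns: the user-facing instance id and the ident string (the header's
# exact field names are ignored by the loaders). The sketch below writes a
# minimal idents.csv of that shape; the header names and idents are illustrative.
def writeExampleIdents(path):
    with open(path, 'w') as f:
        f.write('instance_id,ident\n')
        f.write('1,http://example.com/a\n')
        f.write('2,http://example.com/b\n')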
def loadTrueLabels(self):
    labels_file = dir_tools.getDatasetDirectory(self.project, self.dataset)
    labels_file += 'labels/true_labels.csv'
    # Loads the true labels into the true_labels table if the file exists.
    # Otherwise, no true labels are stored for this dataset.
    if not dir_tools.checkFileExists(labels_file):
        print >> sys.stderr, 'No ground truth labels for this dataset'
        return
    ## Check whether the file contains families
    families = False
    with open(labels_file, 'r') as f:
        header = f.readline()
        fields = header.split(',')
        if len(fields) == 3:
            families = True
    db, cursor = db_tools.getRawConnection()
    if db_tools.isMysql():
        # MySQL: stage the CSV in a temporary table, resolve the instance ids,
        # then insert into true_labels.
        query = 'CREATE TEMPORARY TABLE true_labels_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label varchar(200), '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        query = 'LOAD DATA LOCAL INFILE \'' + labels_file + '\' '
        query += 'INTO TABLE ' + 'true_labels_import' + ' '
        query += 'FIELDS TERMINATED BY \',\' '
        query += 'IGNORE 1 LINES '
        if families:
            query += '(user_instance_id, label, family) '
        else:
            query += '(user_instance_id, label) '
        query += ';'
        cursor.execute(query)
        query = 'UPDATE true_labels_import t '
        query += 'JOIN instances i '
        query += 'ON i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id '
        query += 'SET t.id = i.id;'
        cursor.execute(query)
        query = 'INSERT INTO true_labels(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM true_labels_import AS t;'
        cursor.execute(query)
    elif db_tools.isPostgresql():
        # PostgreSQL: same staging strategy, with COPY instead of LOAD DATA.
        query = 'CREATE TEMPORARY TABLE true_labels_import('
        query += 'user_instance_id integer PRIMARY KEY, '
        query += 'label true_labels_enum, '
        query += 'family varchar(200) DEFAULT \'other\', '
        query += 'dataset_id integer DEFAULT ' + str(self.dataset_id) + ', '
        query += 'id integer DEFAULT NULL'
        query += ');'
        cursor.execute(query)
        with open(labels_file, 'r') as f:
            if families:
                query = 'COPY true_labels_import(user_instance_id,label,family) '
            else:
                query = 'COPY true_labels_import(user_instance_id,label) '
            query += 'FROM STDIN '
            query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
            cursor.copy_expert(sql=query, file=f)
        query = 'UPDATE true_labels_import AS t '
        query += 'SET id = i.id '
        query += 'FROM instances AS i '
        query += 'WHERE i.user_instance_id = t.user_instance_id '
        query += 'AND i.dataset_id = t.dataset_id;'
        cursor.execute(query)
        query = 'INSERT INTO true_labels(instance_id, dataset_id, label, family) '
        query += 'SELECT t.id, t.dataset_id, t.label, t.family '
        query += 'FROM true_labels_import AS t;'
        cursor.execute(query)
    db_tools.closeRawConnection(db, cursor)
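# --- Hedged sketch (not part of the original code) ---
# loadTrueLabels() and initFromFile() both decide whether a labels CSV carries
# a family column by counting the fields of the header line (3 fields means a
# family column is present). The standalone helper below isolates that check.
def labelsFileHasFamilies(labels_file):
    with open(labels_file, 'r') as f:
        header = f.readline()
        return len(header.split(',')) == 3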
def initFromFile(self, labels_filename):
    # Record the labels type of the experiment, then load the partial labels
    # file if there is one.
    if labels_filename is None:
        labels_type = 'none'
    elif labels_filename == 'true_labels.csv':
        labels_type = 'true_labels'
    else:
        labels_type = 'partial_labels'
    exp_labels = db_tables.ExperimentsLabelsAlchemy(
        experiment_id=self.experiment_id, labels_type=labels_type)
    self.session.add(exp_labels)
    self.session.commit()
    self.labels_id = exp_labels.labels_id
    self.labels_type = labels_type
    if labels_type == 'partial_labels':
        filename = dir_tools.getDatasetDirectory(self.project, self.dataset)
        filename += 'labels/' + labels_filename
        if not dir_tools.checkFileExists(filename):
            raise ValueError('The labels file %s does not exist.' % filename)
        ## Check whether the file contains families
        families = False
        with open(filename, 'r') as f:
            header = f.readline()
            fields = header.split(',')
            if len(fields) == 3:
                families = True
        db, cursor = db_tools.getRawConnection()
        if db_tools.isMysql():
            # MySQL: stage the CSV in a temporary table, resolve the instance
            # ids, then insert into labels.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'labels_id integer DEFAULT ' + str(self.labels_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label varchar(200), '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\', '
            query += 'annotation boolean DEFAULT True'
            query += ');'
            cursor.execute(query)
            query = 'LOAD DATA LOCAL INFILE \'' + filename + '\' '
            query += 'INTO TABLE ' + 'labels_import' + ' '
            query += 'FIELDS TERMINATED BY \',\' '
            query += 'IGNORE 1 LINES '
            if families:
                query += '(user_instance_id, label, family) '
            else:
                query += '(user_instance_id, label) '
            query += ';'
            cursor.execute(query)
            query = 'UPDATE labels_import l '
            query += 'JOIN instances i '
            query += 'ON i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ' '
            query += 'SET l.instance_id = i.id;'
            cursor.execute(query)
            query = 'INSERT INTO labels(instance_id,labels_id,label,family,iteration,method,annotation) '
            query += 'SELECT instance_id,labels_id,label,family,iteration,method,annotation '
            query += 'FROM labels_import;'
            cursor.execute(query)
        elif db_tools.isPostgresql():
            # PostgreSQL: same staging strategy, with COPY instead of LOAD DATA.
            query = 'CREATE TEMPORARY TABLE labels_import('
            query += 'instance_id integer, '
            query += 'labels_id integer DEFAULT ' + str(self.labels_id) + ', '
            query += 'user_instance_id integer, '
            query += 'label labels_enum, '
            query += 'family varchar(200) DEFAULT \'other\', '
            query += 'iteration integer DEFAULT 0, '
            query += 'method varchar(200) DEFAULT \'init\', '
            query += 'annotation boolean DEFAULT True'
            query += ');'
            cursor.execute(query)
            with open(filename, 'r') as f:
                if families:
                    query = 'COPY labels_import(user_instance_id,label,family) '
                else:
                    query = 'COPY labels_import(user_instance_id,label) '
                query += 'FROM STDIN '
                query += 'WITH CSV HEADER DELIMITER AS \',\' ;'
                cursor.copy_expert(sql=query, file=f)
            query = 'UPDATE labels_import AS l '
            query += 'SET instance_id = i.id '
            query += 'FROM instances AS i '
            query += 'WHERE i.user_instance_id = l.user_instance_id '
            query += 'AND i.dataset_id = ' + str(self.dataset_id) + ';'
            cursor.execute(query)
            query = 'INSERT INTO labels(instance_id,labels_id,label,family,iteration,method,annotation) '
            query += 'SELECT instance_id,labels_id,label,family,iteration,method,annotation '
            query += 'FROM labels_import;'
            cursor.execute(query)
        db_tools.closeRawConnection(db, cursor)
        self.session.commit()
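# --- Hedged sketch (not part of the original code) ---
# The newer initFromFile() maps its labels_filename argument to a labels_type
# before touching the database: None => 'none', 'true_labels.csv' =>
# 'true_labels', anything else => 'partial_labels'. The pure function below
# mirrors that decision for illustration.
def labelsTypeFromFilename(labels_filename):
    if labels_filename is None:
        return 'none'
    elif labels_filename == 'true_labels.csv':
        return 'true_labels'
    return 'partial_labels'

assert labelsTypeFromFilename(None) == 'none'
assert labelsTypeFromFilename('true_labels.csv') == 'true_labels'
assert labelsTypeFromFilename('my_labels.csv') == 'partial_labels'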