Example #1
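All of the examples below come from the Panoptes/DQX data-import scripts and assume the same module-level imports. A sketch of the assumed header follows; the non-stdlib modules are internal to the Panoptes/DQX codebase, and exact module paths may differ:

# Assumed module-level imports for the examples below; the non-stdlib
# modules are internal to the Panoptes/DQX codebase (paths may differ).
import copy
import os
import shutil

import h5py
import simplejson

import config          # deployment settings: BASEDIR, DB, ...
import DQXDbTools      # database access and credential checks
import DQXUtils        # identifier validation helpers
import ImpUtils        # SQL execution and import utilities
import LoadTable       # bulk table loader
import SettingsLoader  # settings-file loader
import Utils           # workspace/table naming helpers
import arraybuffer     # dtype helpers for HDF5 properties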
def ImportRefGenomeSummaryData(calculationObject, datasetId, folder,
                               importSettings):
    if not os.path.exists(os.path.join(folder, 'summaryvalues')):
        return

    calculationObject.credentialInfo.VerifyCanDo(
        DQXDbTools.DbOperationWrite(datasetId, 'summaryvalues'))

    summaryids = []
    summaryValuesFolder = os.path.join(folder, 'summaryvalues')
    for entry in os.listdir(summaryValuesFolder):
        if os.path.isdir(os.path.join(summaryValuesFolder, entry)):
            summaryids.append(entry)
    for summaryid in summaryids:
        with calculationObject.LogHeader(
                'Importing reference genome summary data ' + summaryid):
            DQXUtils.CheckValidIdentifier(summaryid)
            destFolder = os.path.join(config.BASEDIR, 'SummaryTracks',
                                      datasetId, summaryid)
            if not os.path.exists(destFolder):
                os.makedirs(destFolder)
            dataFileName = os.path.join(destFolder, summaryid)
            shutil.copyfile(
                os.path.join(folder, 'summaryvalues', summaryid, 'values'),
                dataFileName)

            settings = SettingsLoader.SettingsLoader(
                os.path.join(folder, 'summaryvalues', summaryid, 'settings'))
            settings.RequireTokens(['Name', 'MaxVal', 'BlockSizeMax'])
            settings.AddTokenIfMissing('MinVal', 0)
            settings.AddTokenIfMissing('BlockSizeMin', 1)
            settings.AddTokenIfMissing('ChannelColor', 'rgb(0,0,0)')
            settings.AddTokenIfMissing('Order', 99999)
            settings.DefineKnownTokens(['channelColor'])
            settings.AddTokenIfMissing('ScopeStr', importSettings['ScopeStr'])
            print('SETTINGS: ' + settings.ToJSON())
            if importSettings['ScopeStr'] == 'all':
                print('Executing filter bank')
                ImpUtils.ExecuteFilterbankSummary_Value(
                    calculationObject, destFolder, summaryid, settings)
            else:
                calculationObject.Log(
                    'WARNING: Skipping filterbanking genome summary data')
            extraSettings = settings.Clone()
            extraSettings.DropTokens([
                'Name', 'Order', 'MinVal', 'MaxVal', 'BlockSizeMin',
                'BlockSizeMax'
            ])
            sql = "INSERT INTO summaryvalues VALUES ('', 'fixed', '{0}', '-', '{1}', {2}, '{3}', {4}, {5}, {6})".format(
                summaryid, settings['Name'], settings['Order'],
                extraSettings.ToJSON(), settings['MinVal'], settings['MaxVal'],
                settings['BlockSizeMin'])
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
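The importer above reads a fixed on-disk layout under `folder`; a sketch of that layout, inferred from the path handling and required tokens above (`<summaryid>` is illustrative):

# Source layout consumed by ImportRefGenomeSummaryData (inferred from the
# os.path.join calls and RequireTokens above):
#
#   <folder>/summaryvalues/
#       <summaryid>/
#           values    # raw track data, copied to SummaryTracks/<datasetId>/<summaryid>
#           settings  # must define Name, MaxVal, BlockSizeMax; MinVal,
#                     # BlockSizeMin, ChannelColor and Order receive defaults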
Example #2
def ImportDataTable(calculation_object, dataset_id, tableid, folder, import_settings):
    global tableOrder, property_order
    with calculation_object.LogHeader('Importing 2D datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)

        table_settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        table_settings.RequireTokens(['NameSingle', 'NamePlural', 'FirstArrayDimension'])
        table_settings.AddTokenIfMissing('ShowInGenomeBrowser', False)
        table_settings.AddTokenIfMissing('ColumnDataTable', '')
        table_settings.AddTokenIfMissing('RowDataTable', '')
        extra_settings = table_settings.Clone()
        extra_settings.DropTokens(['ColumnDataTable',
                                  'ColumnIndexField',
                                  'RowDataTable',
                                  'RowIndexField',
                                  'Properties'])

        remote_hdf5 = h5py.File(os.path.join(folder, 'data.hdf5'), 'r')
        #Check that the referenced tables exist and have the primary key specified.
        if table_settings['ColumnDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['ColumnDataTable'])
            rows = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            if not rows:
                raise Exception("Index Table " + table_settings['ColumnDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['ColumnIndexField'],
                                                       table_settings['ColumnDataTable'])
            try:
                ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['ColumnIndexField'] + " column index field doesn't exist in table " + table_settings['ColumnDataTable'])
        if table_settings['RowDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['RowDataTable'])
            rows = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            if not rows:
                raise Exception("Index Table " + table_settings['RowDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['RowIndexField'],
                                                       table_settings['RowDataTable'])
            try:
                ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['RowIndexField'] + " row index field doesn't exist in table " + table_settings['RowDataTable'])

        if table_settings['ShowInGenomeBrowser']:
            sql = "SELECT IsPositionOnGenome FROM tablecatalog WHERE id='{0}' ".format(table_settings['ColumnDataTable'])
            is_position = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)[0][0]
            if not is_position:
                raise Exception(table_settings['ColumnDataTable'] + ' is not a genomic position based table (IsPositionOnGenome in config), but you have asked to use this table as a column index on a genome browseable 2D array.')
        if table_settings['FirstArrayDimension'] not in ['column', 'row']:
            raise Exception("FirstArrayDimension must be column or row")

        # Add to tablecatalog
        extra_settings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO 2D_tablecatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6})".format(
            tableid,
            table_settings['NamePlural'],
            table_settings['ColumnDataTable'],
            table_settings['RowDataTable'],
            table_settings['FirstArrayDimension'],
            extra_settings.ToJSON(),
            tableOrder
        )
        ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
        tableOrder += 1

        for property in table_settings['Properties']:
            extra_settings = copy.deepcopy(property)
            dtype = arraybuffer._strict_dtype_string(remote_hdf5[property['Id']].dtype)
            del extra_settings['Id']
            del extra_settings['Name']
            sql = "INSERT INTO 2D_propertycatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', {5}, '{6}', '{7}')".format(
                property['Id'],
                tableid,
                table_settings['ColumnDataTable'],
                table_settings['RowDataTable'],
                property['Name'],
                property_order,
                dtype,
                simplejson.dumps(extra_settings)
            )
            ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            property_order += 1

        if not import_settings['ConfigOnly']:
            #Insert an index column into the index tables
            if table_settings['ColumnDataTable']:
                if table_settings.HasToken('ColumnIndexArray'):
                    #We have an array that maps onto a column of the 1D SQL table, so we add an index to that table.
                    #First create a temporary table holding the index array.
                    try:
                        column_index = remote_hdf5[table_settings['ColumnIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['ColumnIndexArray']))
                    for property in table_settings['Properties']:
                        if len(column_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'column' else 1]:
                            raise Exception("Property {0} has a different column length to the column index".format(property))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempColIndex', table_settings['ColumnIndexField'], column_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)

                    #Number the rows of the temporary table, then copy that index across to the 1D data table
                    sql = """ALTER TABLE `TempColIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;UPDATE `TempColIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempColIndex` ON `{0}`.`{1}` = `TempColIndex`.`{1}` SET `{0}`.`{2}_column_index` = `TempColIndex`.`index`;
                             DROP TABLE `TempColIndex`""".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    #Now check we have no NULLS
                    sql = "SELECT `{1}_column_index` from `{0}` where `{1}_column_index` IS NULL".format(
                        table_settings['ColumnDataTable'],
                        tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding column in 2D datatable {1}".format(table_settings['ColumnDataTable'], tableid))
                else:
                    #No index array was given, so the HDF5 data is assumed to follow the sort order of "ColumnIndexField"; index that column in sorted order
                    sql = """ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             SELECT @i:=-1;UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}`;
                             """.format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)

            if table_settings['RowDataTable']:
                if table_settings.HasToken('RowIndexArray'):
                    #We have an array that maps onto a column of the 1D SQL table, so we add an index to that table.
                    #First create a temporary table holding the index array.
                    try:
                        row_index = remote_hdf5[table_settings['RowIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['RowIndexArray']))
                    for property in table_settings['Properties']:
                        if len(row_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'row' else 1]:
                            raise Exception("Property {0} has a different row length to the row index".format(property))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempRowIndex', table_settings['RowIndexField'], row_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)

                    #Number the rows of the temporary table, then copy that index across to the 1D data table
                    sql = """ALTER TABLE `TempRowIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;UPDATE `TempRowIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempRowIndex` ON `{0}`.`{1}` = `TempRowIndex`.`{1}` SET `{0}`.`{2}_row_index` = `TempRowIndex`.`index`;
                             DROP TABLE `TempRowIndex`""".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    #Now check we have no NULLS
                    sql = "SELECT `{1}_row_index` from `{0}` where `{1}_row_index` IS NULL".format(
                        table_settings['RowDataTable'],
                        tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding row in 2D datatable {1}".format(table_settings['RowDataTable'], tableid))
                else:
                    #No index array was given, so the HDF5 data is assumed to follow the sort order of "RowIndexField"; index that column in sorted order
                    sql = """ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             SELECT @i:=-1;UPDATE `{0}` SET `{2}_row_index` = @i:=@i+1 ORDER BY `{1}`;
                             """.format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)


            #We have the indexes - now we need a local copy of the HDF5 data for each property
            ImpUtils.mkdir(os.path.join(config.BASEDIR, '2D_data'))
            local_hdf5 = h5py.File(os.path.join(config.BASEDIR, '2D_data', dataset_id+'_'+tableid+'.hdf5'), 'w', libver='latest')
            print "Copying HDF5 datasets"
            for property in table_settings['Properties']:
                print "..", property
                local_hdf5.copy(remote_hdf5[property['Id']], property['Id'])
                print "done"
            print "all copies complete"
            local_hdf5.close()
            remote_hdf5.close()
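The `SELECT @i:=-1; UPDATE ... SET ... = @i:=@i+1` statements above use a MySQL user variable to hand out consecutive 0-based row numbers in a chosen sort order. A standalone sketch of the pattern (assuming a MySQL DB-API connection `db`; `mytable` and `sortcol` are illustrative names):

# Sketch of the MySQL user-variable row-numbering trick used above.
cur = db.cursor()
cur.execute("ALTER TABLE `mytable` ADD `row_index` INT DEFAULT NULL")
cur.execute("SELECT @i := -1")  # seed the counter so the first row gets 0
cur.execute("UPDATE `mytable` SET `row_index` = @i := @i + 1 "
            "ORDER BY `sortcol`")
db.commit()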
Example #3
def ImportDataTable(calculationObject, datasetId, tableid, folder, importSettings):
    Utils.CheckSafeIdentifier(tableid)
    global tableOrder
    with calculationObject.LogHeader('Importing datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)

        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablecatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'relations'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablebasedsummaryvalues'))

        tableSettings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        tableSettings.RequireTokens(['NameSingle', 'NamePlural', 'PrimKey'])
        tableSettings.AddTokenIfMissing('IsPositionOnGenome', False)
        tableSettings.AddTokenIfMissing('IsRegionOnGenome', False)
        tableSettings.AddTokenIfMissing('MaxTableSize', None)
        tableSettings.AddTokenIfMissing('MaxCountQueryRecords', 200000)
        tableSettings.AddTokenIfMissing('MaxCountQueryAggregated', 1000000)
        tableSettings.AddTokenIfMissing('AllowSubSampling', False)
        extraSettings = tableSettings.Clone()
        extraSettings.DropTokens(['PrimKey', 'Properties'])

        if tableSettings['MaxTableSize'] is not None:
            print('WARNING: table size limited to '+str(tableSettings['MaxTableSize']))

        # Drop existing tablecatalog record
        sql = "DELETE FROM tablecatalog WHERE id='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        # Add to tablecatalog
        extraSettings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO tablecatalog VALUES ('{0}', '{1}', '{2}', {3}, '{4}', {5})".format(
            tableid,
            tableSettings['NamePlural'],
            tableSettings['PrimKey'],
            tableSettings['IsPositionOnGenome'],
            extraSettings.ToJSON(),
            tableOrder
        )
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        tableOrder += 1

        properties = ImpUtils.LoadPropertyInfo(calculationObject, tableSettings, os.path.join(folder, 'data'))

        sql = "DELETE FROM propertycatalog WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        sql = "DELETE FROM relations WHERE childtableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'DataType', 'Order'])
            sql = "INSERT INTO propertycatalog VALUES ('', 'fixed', '{0}', '{1}', '{2}', '{3}', {4}, '{5}')".format(
                settings['DataType'],
                propid,
                tableid,
                settings['Name'],
                0,
                extraSettings.ToJSON()
            )
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            if settings.HasToken('Relation'):
                relationSettings = settings.GetSubSettings('Relation')
                calculationObject.Log('Creating relation: '+relationSettings.ToJSON())
                relationSettings.RequireTokens(['TableId'])
                relationSettings.AddTokenIfMissing('ForwardName', 'belongs to')
                relationSettings.AddTokenIfMissing('ReverseName', 'has')
                sql = "INSERT INTO relations VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}')".format(
                    tableid,
                    propid,
                    relationSettings['TableId'],
                    '',
                    relationSettings['ForwardName'],
                    relationSettings['ReverseName']
                )
                ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1



        propidList = []
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
            propidList.append(property['propid'])

        if tableSettings['IsPositionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing settings tag Chromosome in genome position datatable.')
            if not tableSettings.HasToken('Position'):
                raise Exception('Missing settings tag Position in genome position datatable.')
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['Position'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Position']))

        if tableSettings['IsRegionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing setting "Chromosome" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStart'):
                raise Exception('Missing setting "RegionStart" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStop'):
                raise Exception('Missing setting "RegionStop" in genome region datatable {0}'.format(tableid))
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['RegionStart'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStart']))
            if tableSettings['RegionStop'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStop']))
            propDict[tableSettings['Chromosome']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStart']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStop']]['Settings'].SetToken('Index', True)


        if not importSettings['ConfigOnly']:
            columns = [ {
                            'name': prop['propid'],
                            'DataType': prop['DataType'],
                            'Index': prop['Settings']['Index'],
                            'ReadData': prop['Settings']['ReadData']
                        }
                        for prop in properties if (prop['propid'] != 'AutoKey')]
            LoadTable.LoadTable(
                calculationObject,
                os.path.join(folder, 'data'),
                datasetId,
                tableid,
                columns,
                tableSettings,
                importSettings,
                tableSettings['AllowSubSampling']
            )
            if tableSettings['IsPositionOnGenome']:
                calculationObject.Log('Indexing chromosome')
                scr = ImpUtils.SQLScript(calculationObject)
                scr.AddCommand('create index {0}_chrompos ON {0}({1},{2})'.format(
                    tableid,
                    tableSettings['Chromosome'],
                    tableSettings['Position']
                ))
                scr.Execute(datasetId)



        print('Creating summary values')
        sql = "DELETE FROM summaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        for property in properties:
            propid = property['propid']
            settings = property['Settings']

            if (settings.HasToken('SummaryValues')) and ImpUtils.IsValueDataTypeIdenfifier(property['DataType']):
                with calculationObject.LogHeader('Creating numerical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal', settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal', settings['maxval'])
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )

            if (settings.HasToken('SummaryValues')) and (property['DataType'] == 'Text'):
                with calculationObject.LogHeader('Creating categorical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    categories = []
                    if settings.HasToken('categoryColors'):
                        stt = settings.GetSubSettings('categoryColors')
                        categories = [x for x in stt.Get()]
                        summSettings.AddTokenIfMissing('Categories', categories)
                    ImpUtils.CreateSummaryValues_Categorical(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )

        sql = "DELETE FROM tablebasedsummaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        if tableSettings.HasToken('TableBasedSummaryValues'):
            calculationObject.Log('Processing table-based summary values')
            if not isinstance(tableSettings['TableBasedSummaryValues'], list):
                raise Exception('TableBasedSummaryValues token should be a list')
            for stt in tableSettings['TableBasedSummaryValues']:
                summSettings = SettingsLoader.SettingsLoader()
                summSettings.LoadDict(stt)
                summSettings.RequireTokens(['Id', 'Name', 'MaxVal', 'BlockSizeMax'])
                summSettings.AddTokenIfMissing('MinVal', 0)
                summSettings.AddTokenIfMissing('BlockSizeMin', 1)
                summSettings.DefineKnownTokens(['channelColor'])
                summaryid = summSettings['Id']
                with calculationObject.LogHeader('Table based summary value {0}, {1}'.format(tableid, summaryid)):
                    extraSummSettings = summSettings.Clone()
                    extraSummSettings.DropTokens(['Id', 'Name', 'MinVal', 'MaxVal', 'BlockSizeMin', 'BlockSizeMax'])
                    sql = "INSERT INTO tablebasedsummaryvalues VALUES ('{0}', '{1}', '{2}', '{3}', {4}, {5}, {6}, 0)".format(
                        tableid,
                        summaryid,
                        summSettings['Name'],
                        extraSummSettings.ToJSON(),
                        summSettings['MinVal'],
                        summSettings['MaxVal'],
                        summSettings['BlockSizeMin']
                    )
                    ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
                    if importSettings['ScopeStr'] == 'all':
                        itemtracknr = 0
                        for fileid in os.listdir(os.path.join(folder, summaryid)):
                            if not os.path.isdir(os.path.join(folder, summaryid, fileid)):
                                itemtracknr += 1
                                calculationObject.Log('Processing {0}: {1}'.format(itemtracknr, fileid))
                                destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, 'TableTracks', tableid, summaryid, fileid)
                                calculationObject.Log('Destination: '+destFolder)
                                if not os.path.exists(destFolder):
                                    os.makedirs(destFolder)
                                shutil.copyfile(os.path.join(folder, summaryid, fileid), os.path.join(destFolder, summaryid+'_'+fileid))
                                ImpUtils.ExecuteFilterbankSummary_Value(calculationObject, destFolder, summaryid+'_'+fileid, summSettings)
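A minimal settings file for a datatable accepted by this importer can be reconstructed from the `RequireTokens` and `AddTokenIfMissing` calls above; shown here as a dict sketch with illustrative values (the real files are parsed from disk by SettingsLoader):

# Reconstructed minimal datatable settings (illustrative values only).
table_settings_sketch = {
    'NameSingle': 'variant',         # required
    'NamePlural': 'variants',        # required
    'PrimKey': 'SnpId',              # required
    'IsPositionOnGenome': True,      # requires Chromosome and Position below
    'Chromosome': 'chrom',           # must name an imported property
    'Position': 'pos',               # must name an imported property
    'MaxCountQueryRecords': 200000,  # default when omitted
    'AllowSubSampling': False,       # default when omitted
}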
Example #4
def ImportWorkspace(calculationObject, datasetId, workspaceid, folder,
                    importSettings):
    Utils.CheckSafeIdentifier(workspaceid)
    with calculationObject.LogHeader('Importing workspace {0}.{1}'.format(
            datasetId, workspaceid)):
        print('Source directory: ' + folder)
        settings = SettingsLoader.SettingsLoader(
            os.path.join(folder, 'settings'))
        settings.RequireTokens(['Name'])
        print(settings.ToJSON())
        workspaceName = settings['Name']

        if not ImpUtils.IsDatasetPresentInServer(
                calculationObject.credentialInfo, datasetId):
            raise Exception(
                'Dataset {0} is not found. Please import the dataset first'.
                format(datasetId))

        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo,
                                     datasetId)
        cur = db.cursor()

        def execSQL(cmd):
            calculationObject.LogSQLCommand(cmd)
            cur.execute(cmd)

        cur.execute('SELECT id, primkey, settings FROM tablecatalog')
        tables = [{
            'id': row[0],
            'primkey': row[1],
            'settingsStr': row[2]
        } for row in cur.fetchall()]
        tableMap = {table['id']: table for table in tables}

        for table in tables:
            tableSettings = SettingsLoader.SettingsLoader()
            tableSettings.LoadDict(
                simplejson.loads(table['settingsStr'], strict=False))
            table['settings'] = tableSettings

        if not importSettings['ConfigOnly']:
            for table in tables:
                tableid = table['id']
                print('Re-creating custom data table for ' + tableid)
                execSQL("DROP TABLE IF EXISTS {0}".format(
                    Utils.GetTableWorkspaceProperties(workspaceid, tableid)))
                execSQL(
                    "CREATE TABLE {0} (StoredSelection TINYINT DEFAULT 0) AS SELECT {1} FROM {2}"
                    .format(
                        Utils.GetTableWorkspaceProperties(
                            workspaceid, tableid), table['primkey'], tableid))
                execSQL("create unique index {1} on {0}({1})".format(
                    Utils.GetTableWorkspaceProperties(workspaceid, tableid),
                    table['primkey']))
                execSQL(
                    "create index idx_StoredSelection on {0}(StoredSelection)".
                    format(
                        Utils.GetTableWorkspaceProperties(
                            workspaceid, tableid)))

        print('Removing existing workspace properties')
        execSQL("DELETE FROM propertycatalog WHERE workspaceid='{0}'".format(
            workspaceid))

        calculationObject.Log('Creating StoredSelection columns')
        for table in tables:
            tableid = table['id']
            sett = '{"CanUpdate": true, "Index": false, "ReadData": false, "showInTable": false, "Search":"None" }'
            cmd = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', 'Boolean', 'StoredSelection', '{1}', 'Stored selection', 0, '{2}')".format(
                workspaceid, tableid, sett)
            execSQL(cmd)

        print('Re-creating workspaces record')
        execSQL("DELETE FROM workspaces WHERE id='{0}'".format(workspaceid))
        execSQL('INSERT INTO workspaces VALUES ("{0}","{1}")'.format(
            workspaceid, workspaceName))
        if not importSettings['ConfigOnly']:
            print('Updating views')
            for table in tables:
                Utils.UpdateTableInfoView(
                    workspaceid, table['id'],
                    table['settings']['AllowSubSampling'], cur)

        db.commit()
        db.close()

        print('Scanning for custom data')
        if os.path.exists(os.path.join(folder, 'customdata')):
            for tableid in os.listdir(os.path.join(folder, 'customdata')):
                if os.path.isdir(os.path.join(folder, 'customdata', tableid)):
                    if tableid not in tableMap:
                        raise Exception('Invalid table id ' + tableid)

                    #Read optional settings file
                    customdatalist = None
                    settingsfilename = os.path.join(folder, 'customdata',
                                                    tableid, 'settings')
                    if os.path.isfile(settingsfilename):
                        cSettings = SettingsLoader.SettingsLoader(
                            settingsfilename)
                        if cSettings.HasToken('CustomData'):
                            if not isinstance(cSettings['CustomData'], list):
                                raise Exception(
                                    'CustomData token should be a list')
                            customdatalist = cSettings['CustomData']
                            print('Custom data list taken from settings')
                    if customdatalist is None:  # Alternatively, just use list of folders
                        customdatalist = []
                        for customid in os.listdir(
                                os.path.join(folder, 'customdata', tableid)):
                            if os.path.isdir(
                                    os.path.join(folder, 'customdata', tableid,
                                                 customid)):
                                customdatalist.append(customid)
                        print('Custom data list taken from folders')
                    print('Custom data list: ' + str(customdatalist))
                    for customid in customdatalist:
                        ImportCustomData(
                            calculationObject, datasetId, workspaceid, tableid,
                            customid,
                            os.path.join(folder, 'customdata', tableid,
                                         customid), importSettings)

        else:
            print('Directory not present')
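The scan above implies this workspace source layout (inferred from the path handling; when `customdata/<tableid>/settings` is absent, each subfolder of the table directory is treated as one custom data source):

# Workspace source layout implied by ImportWorkspace (names illustrative):
#
#   <folder>/settings                 # must define Name
#   <folder>/customdata/
#       <tableid>/                    # must match a table in tablecatalog
#           settings                  # optional; may define CustomData: [...]
#           <customid>/
#               data                  # consumed by ImportCustomData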
Example #5
def ImportCustomData(calculationObject, datasetId, workspaceid, tableid,
                     sourceid, folder, importSettings):

    with calculationObject.LogHeader('Importing custom data'):
        print('Importing custom data into {0}, {1}, {2} FROM {3}'.format(
            datasetId, workspaceid, tableid, folder))

        credInfo = calculationObject.credentialInfo

        if not ImpUtils.IsDatasetPresentInServer(
                calculationObject.credentialInfo, datasetId):
            raise Exception(
                'Dataset {0} is not found. Please import the dataset first'.
                format(datasetId))

        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'workspaces'))

        cur = db.cursor()
        cur.execute(
            'SELECT primkey, settings FROM tablecatalog WHERE id="{0}"'.format(
                tableid))
        row = cur.fetchone()
        if row is None:
            raise Exception(
                'Unable to find table record for table {0} in dataset {1}'.
                format(tableid, datasetId))
        primkey = row[0]
        tableSettingsStr = row[1]
        db.close()

        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            'DELETE FROM customdatacatalog WHERE tableid="{tableid}" and sourceid="{sourceid}"'
            .format(tableid=tableid, sourceid=sourceid))

        tableSettings = SettingsLoader.SettingsLoader()
        tableSettings.LoadDict(simplejson.loads(tableSettingsStr,
                                                strict=False))

        allowSubSampling = tableSettings['AllowSubSampling']

        isPositionOnGenome = False
        if tableSettings.HasToken(
                'IsPositionOnGenome') and tableSettings['IsPositionOnGenome']:
            isPositionOnGenome = True
            chromField = tableSettings['Chromosome']
            posField = tableSettings['Position']

        settings = SettingsLoader.SettingsLoader(
            os.path.join(folder, 'settings'))

        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            "INSERT INTO customdatacatalog VALUES ('{tableid}', '{sourceid}', '{settings}')"
            .format(tableid=tableid,
                    sourceid=sourceid,
                    settings=settings.ToJSON()))

        properties = ImpUtils.LoadPropertyInfo(calculationObject, settings,
                                               os.path.join(folder, 'data'))

        # remove primary key, just in case
        properties = [prop for prop in properties if prop['propid'] != primkey]

        sourcetable = Utils.GetTableWorkspaceProperties(workspaceid, tableid)
        print('Source table: ' + sourcetable)

        #Get list of existing properties
        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo,
                                     datasetId)
        cur = db.cursor()
        cur.execute(
            'SELECT propid FROM propertycatalog WHERE (workspaceid="{0}") and (source="custom") and (tableid="{1}")'
            .format(workspaceid, tableid))
        existingProperties = [row[0] for row in cur.fetchall()]
        print('Existing properties: ' + str(existingProperties))
        db.close()

        propidList = []
        for property in properties:
            DQXUtils.CheckValidIdentifier(property['propid'])
            propidList.append(property['propid'])

        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo,
                                     datasetId)
        cur = db.cursor()

        if not importSettings['ConfigOnly']:
            # Dropping columns that will be replaced
            toRemoveExistingProperties = []
            for existProperty in existingProperties:
                if existProperty in propidList:
                    toRemoveExistingProperties.append(existProperty)
            print('Removing outdated information:')
            if len(toRemoveExistingProperties) > 0:
                for prop in toRemoveExistingProperties:
                    print('Removing outdated information: {0} {1} {2}'.format(
                        workspaceid, prop, tableid))
                    sql = 'DELETE FROM propertycatalog WHERE (workspaceid="{0}") and (propid="{1}") and (tableid="{2}")'.format(
                        workspaceid, prop, tableid)
                    print(sql)
                    cur.execute(sql)
                sql = "ALTER TABLE {0} ".format(sourcetable)
                for prop in toRemoveExistingProperties:
                    if prop != toRemoveExistingProperties[0]:
                        sql += ", "
                    sql += "DROP COLUMN {0}".format(prop)
                calculationObject.LogSQLCommand(sql)
                cur.execute(sql)

        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(
                ['Name', 'DataType', 'Order', 'SummaryValues'])
            print('Create property catalog entry for {0} {1} {2}'.format(
                workspaceid, tableid, propid))
            sql = "DELETE FROM propertycatalog WHERE (workspaceid='{0}') and (propid='{1}') and (tableid='{2}')".format(
                workspaceid, propid, tableid)
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            sql = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', '{1}', '{2}', '{3}', '{4}', {5}, '{6}')".format(
                workspaceid, settings['DataType'], propid, tableid,
                settings['Name'], 0, extraSettings.ToJSON())
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1

        propDict = {}
        for property in properties:
            propDict[property['propid']] = property

        if not importSettings['ConfigOnly']:
            tmptable = Utils.GetTempID()
            columns = [{
                'name': prop['propid'],
                'DataType': prop['DataType'],
                'Index': prop['Settings']['Index'],
                'ReadData': prop['Settings']['ReadData']
            } for prop in properties]
            columns.append({
                'name': primkey,
                'DataType': 'Text',
                'Index': False,
                'ReadData': True
            })
            # print('----------------------------------------------------------------')
            # print(str(columns))
            # print('----------------------------------------------------------------')
            LoadTable.LoadTable(calculationObject,
                                os.path.join(folder, 'data'), datasetId,
                                tmptable, columns, {'PrimKey': primkey},
                                importSettings, False)

            print('Creating new columns')
            calculationObject.Log(
                'WARNING: better mechanism to determine column types needed here'
            )  #TODO: implement
            frst = True
            sql = "ALTER TABLE {0} ".format(sourcetable)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sqldatatype = ImpUtils.GetSQLDataType(property['DataType'])
                sql += "ADD COLUMN {0} {1}".format(propid, sqldatatype)
                frst = False
            calculationObject.LogSQLCommand(sql)  # log the complete statement once
            cur.execute(sql)

            print('Joining information')
            frst = True
            credInfo.VerifyCanDo(
                DQXDbTools.DbOperationWrite(datasetId, sourcetable))
            sql = "update {0} left join {1} on {0}.{2}={1}.{2} set ".format(
                sourcetable, tmptable, primkey)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sql += "{0}.{2}={1}.{2}".format(sourcetable, tmptable, propid)
                frst = False
            calculationObject.LogSQLCommand(sql)  # log the complete statement once
            cur.execute(sql)

            print('Cleaning up')
            cur.execute("DROP TABLE {0}".format(tmptable))

        if not importSettings['ConfigOnly']:
            print('Updating view')
            Utils.UpdateTableInfoView(workspaceid, tableid, allowSubSampling,
                                      cur)

        db.commit()
        db.close()

        print('Creating summary values')
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            if settings.HasToken('SummaryValues'):
                with calculationObject.LogHeader(
                        'Creating summary values for custom data {0}'.format(
                            tableid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal',
                                                       settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal',
                                                       settings['maxval'])
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks',
                                              datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)

                    if not isPositionOnGenome:
                        raise Exception(
                            'Summary values defined for non-position table')

                    if not importSettings['ConfigOnly']:
                        calculationObject.Log('Extracting data to ' +
                                              dataFileName)
                        script = ImpUtils.SQLScript(calculationObject)
                        script.AddCommand(
                            "SELECT {2} as chrom, {3} as pos, {0} FROM {1} ORDER BY {2},{3}"
                            .format(
                                propid,
                                Utils.GetTableWorkspaceView(
                                    workspaceid, tableid), chromField,
                                posField))
                        script.Execute(datasetId, dataFileName)
                        calculationObject.LogFileTop(dataFileName, 10)

                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject, summSettings, datasetId, tableid,
                        'custom', workspaceid, propid, settings['Name'],
                        dataFileName, importSettings)
Example No. 7
def ImportDataSet(calculationObject, baseFolder, datasetId, importSettings):
    Utils.CheckSafeIdentifier(datasetId)
    with calculationObject.LogHeader(
            'Importing dataset {0}'.format(datasetId)):
        calculationObject.Log('Import settings: ' + str(importSettings))
        DQXUtils.CheckValidIdentifier(datasetId)
        datasetFolder = os.path.join(baseFolder, datasetId)
        indexDb = config.DB

        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(indexDb, 'datasetindex'))
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId))

        # Remove current reference in the index first: if import fails, nothing will show up
        ImpUtils.ExecuteSQL(
            calculationObject, indexDb,
            'DELETE FROM datasetindex WHERE id="{0}"'.format(datasetId))

        globalSettings = SettingsLoader.SettingsLoader(
            os.path.join(datasetFolder, 'settings'))
        globalSettings.RequireTokens(['Name'])
        globalSettings.AddTokenIfMissing('Description', '')

        print('Global settings: ' + str(globalSettings.Get()))

        if not importSettings['ConfigOnly']:
            # Dropping existing database
            calculationObject.SetInfo('Dropping database')
            print('Dropping database')
            try:
                ImpUtils.ExecuteSQL(
                    calculationObject, indexDb,
                    'DROP DATABASE IF EXISTS {0}'.format(datasetId))
            except Exception:
                # best effort: the CREATE DATABASE below will surface any real problem
                pass
            ImpUtils.ExecuteSQL(calculationObject, indexDb,
                                'CREATE DATABASE {0}'.format(datasetId))

            # Creating new database
            scriptPath = os.path.dirname(os.path.realpath(__file__))
            calculationObject.SetInfo('Creating database')
            print('Creating new database')
            ImpUtils.ExecuteSQLScript(calculationObject,
                                      scriptPath + '/createdataset.sql',
                                      datasetId)

        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM propertycatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM summaryvalues')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM tablecatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM settings')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM customdatacatalog')

        datatables = []

        if globalSettings.HasToken('DataTables'):
            if not type(globalSettings['DataTables']) is list:
                raise Exception('DataTables token should be a list')
            datatables = globalSettings['DataTables']

        for dir in os.listdir(os.path.join(datasetFolder, 'datatables')):
            if os.path.isdir(os.path.join(datasetFolder, 'datatables', dir)):
                if dir not in datatables:
                    datatables.append(dir)
        print('Data tables: ' + str(datatables))
        for datatable in datatables:
            ImportDataTable.ImportDataTable(
                calculationObject, datasetId, datatable,
                os.path.join(datasetFolder, 'datatables', datatable),
                importSettings)

        try:
            datatables_2D = globalSettings['2D_DataTables']
        except KeyError:
            datatables_2D = []
        if type(datatables_2D) is not list:
            raise TypeError('2D_DataTables token should be a list')
        for datatable in datatables_2D:
            Import2DDataTable.ImportDataTable(
                calculationObject, datasetId, datatable,
                os.path.join(datasetFolder, '2D_datatables', datatable),
                importSettings)

        if os.path.exists(os.path.join(datasetFolder, 'refgenome')):
            ImportRefGenome.ImportRefGenome(
                calculationObject, datasetId,
                os.path.join(datasetFolder, 'refgenome'), importSettings)
            globalSettings.AddTokenIfMissing('hasGenomeBrowser', True)

        ImportWorkspaces.ImportWorkspaces(calculationObject, datasetFolder,
                                          datasetId, importSettings)

        # Global settings
        print('Defining global settings')
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId,
                                      globalSettings)

        # Finalise: register dataset
        print('Registering dataset')
        importtime = 0
        if not importSettings['ConfigOnly']:
            importtime = time.time()
        ImpUtils.ExecuteSQL(
            calculationObject, indexDb,
            'INSERT INTO datasetindex VALUES ("{0}", "{1}", "{2}")'.format(
                datasetId, globalSettings['Name'], str(math.ceil(importtime))))
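
A minimal sketch of driving the importer above. The importSettings keys ('ConfigOnly', 'ScopeStr') and the baseFolder/datasetId layout come from the code itself; the calculation object 'calc' and the paths are hypothetical placeholders:

# Hedged usage sketch for ImportDataSet; 'calc' stands in for whatever
# calculation object the surrounding framework provides (hypothetical here).
importSettings = {
    'ConfigOnly': False,  # False: load the data as well as the configuration
    'ScopeStr': 'all',    # or '1k'/'10k'/'100k'/'1M'/'10M' to cap imported rows
}
ImportDataSet(calc, '/data/sources', 'my_dataset', importSettings)
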
Example No. 8
    def ImportDataTable(self, tableid):
        with self._calculationObject.LogHeader('Importing 2D datatable {0}'.format(tableid)):

            DQXUtils.CheckValidTableIdentifier(tableid)
            self._calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(self._datasetId, '2D_tablecatalog'))
            self._calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(self._datasetId, '2D_propertycatalog'))

            max_line_count = None
            if self._maxLineCount > 0:
                max_line_count = self._maxLineCount

            table_settings = self.getSettings(tableid)

            settingsFile, dataFile = self._getDataFiles(tableid)
            remote_hdf5 = h5py.File(dataFile, 'r')
            # Check that the referenced tables exist and have the primary key specified.
            if table_settings['ColumnDataTable']:
                tables = self._getTablesInfo(table_settings['ColumnDataTable'])
                cat_id = tables[0]["id"]
                sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['ColumnIndexField'],
                                                           table_settings['ColumnDataTable'])
                try:
                    idx_field = self._execSqlQuery(sql)
                except Exception:
                    raise Exception(table_settings['ColumnIndexField'] + " column index field doesn't exist in table " + table_settings['ColumnDataTable'])
            if table_settings['RowDataTable']:
                tables = self._getTablesInfo(table_settings['RowDataTable'])
                cat_id = tables[0]["id"]

                sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['RowIndexField'],
                                                           table_settings['RowDataTable'])
                try:
                    idx_field = self._execSqlQuery(sql)
                except Exception:
                    raise Exception(table_settings['RowIndexField'] + " row index field doesn't exist in table " + table_settings['RowDataTable'])
    
            if table_settings['ShowInGenomeBrowser']:
                sql = "SELECT IsPositionOnGenome FROM tablecatalog WHERE id='{0}' ".format(table_settings['ColumnDataTable'])
                is_position = self._execSqlQuery(sql)[0][0]
                if not is_position:
                    raise Exception(table_settings['ColumnDataTable'] + ' is not a genomic position based table (IsPositionOnGenome in config), but you have asked to use this table as a column index on a genome browseable 2D array.')
            if table_settings['FirstArrayDimension'] not in ['column', 'row']:
                raise Exception("FirstArrayDimension must be column or row")
    
            # Add to tablecatalog
            sql = "INSERT INTO 2D_tablecatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6})".format(
                tableid,
                table_settings['NamePlural'],
                table_settings['ColumnDataTable'],
                table_settings['RowDataTable'],
                table_settings['FirstArrayDimension'],
                table_settings.serialize(),
                self.tableOrder
            )
            self._execSql(sql)
            self.tableOrder += 1
    
            for propname in table_settings.getPropertyNames():
                propid = table_settings.getPropertyValue(propname,'Id')
                dtype = arraybuffer._strict_dtype_string(remote_hdf5[propid].dtype)
                arity = 1 if len(remote_hdf5[propid].shape) == 2 else remote_hdf5[propid].shape[2]
                sql = "INSERT INTO 2D_propertycatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', {5}, '{6}', '{7}', {8})".format(
                    propid,
                    tableid,
                    table_settings['ColumnDataTable'],
                    table_settings['RowDataTable'],
                    table_settings.getPropertyValue(propname,'Name'),
                    self.property_order,
                    dtype,
                    table_settings.serializeProperty(propname),
                    arity
                )
                self._execSql(sql)
                self.property_order += 1
    
            if not self._importSettings['ConfigOnly']:
                #Insert an index column into the index tables
                if table_settings['ColumnDataTable']:
                    # Assume that index field has been created on import in LoadTable - it's much faster
                    # We could just run the command and ignore the error raised if it already exists
                    # sql = "ALTER TABLE `{0}` ADD `{1}_column_index` INT DEFAULT NULL;".format(table_settings['ColumnDataTable'], tableid)
                    # self._execSql(sql)
                    if table_settings['ColumnIndexArray']:
                        #We have an array that matches a column in the 1D SQL table, so we add an index to it
                        #First create a temporary table with the index array
                        try:
                            column_index = remote_hdf5[table_settings['ColumnIndexArray']]
                        except KeyError:
                            raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['ColumnIndexArray']))
                        for prop in table_settings['Properties']:
                            if len(column_index) != remote_hdf5[prop['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'column' else 1]:
                                raise Exception("Property {0} has a different column length to the column index".format(prop['Id']))
                            
                        #TempColIndex should really be a TEMPORARY table
                        self._dropTable('`TempColIndex`')
                        sql = ImpUtils.Numpy_to_SQL().create_table('TempColIndex', table_settings['ColumnIndexField'], column_index[0:max_line_count])
                        ImpUtils.ExecuteSQLGenerator(self._calculationObject, self._datasetId, sql)
    
                        #Add an index to the table - catch the exception if it exists.
                        sql = "ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;".format(
                            table_settings['ColumnDataTable'],
                            table_settings['ColumnIndexField'],
                            tableid)
                        try:
                            self._execSql(sql)
                        except OperationalError as e:
                            if e.args[0] != 1060:  # 1060: duplicate column name, i.e. the index column already exists
                                raise
    
                        # We have a datatable - add an index to it then copy that index across to the data table
                        sql = """ALTER TABLE `TempColIndex` ADD `index` INT DEFAULT NULL;
                                 SELECT @i:=-1;UPDATE `TempColIndex` SET `index` = @i:=@i+1;
                                 UPDATE `{0}` INNER JOIN `TempColIndex` ON `{0}`.`{1}` = `TempColIndex`.`{1}` SET `{0}`.`{2}_column_index` = `TempColIndex`.`index`;
                                 """.format(
                            table_settings['ColumnDataTable'],
                            table_settings['ColumnIndexField'],
                            tableid)
                        self._execSql(sql)
                        self._dropTable('`TempColIndex`')
                        #Now check we have no NULLS
                        sql = "SELECT `{1}_column_index` from `{0}` where `{1}_column_index` IS NULL".format(
                            table_settings['ColumnDataTable'],
                            tableid)
                        nulls = self._execSqlQuery(sql)
                        if len(nulls) > 0:
                            print("WARNING: Not all rows in {0} have a corresponding column in 2D datatable {1}".format(table_settings['ColumnDataTable'], tableid))
                    else:
                        #Add an index to the table - catch the exception if it exists.
                        sql = "ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;".format(
                            table_settings['ColumnDataTable'],
                            table_settings['ColumnIndexField'],
                            tableid)
                        try:
                            self._execSql(sql)
                        except OperationalError as e:
                            if e.args[0] != 1060:  # 1060: duplicate column name, i.e. the index column already exists
                                raise

                        #No index array was given, so the HDF5 data is assumed to be in the same order as the sorted "ColumnIndexField"; index by that column in order
                        if max_line_count:
                            sql = "SELECT @i:=-1;UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}` LIMIT {3};"
                        else:
                            sql = "SELECT @i:=-1;UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}`;"

                        sql = sql.format(
                            table_settings['ColumnDataTable'],
                            table_settings['ColumnIndexField'],
                            tableid, max_line_count)
                        self._execSql(sql)
    
                if table_settings['RowDataTable']:
                    #Add an index to the table - catch the exception if it exists.
                    sql = "ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    try:
                        self._execSql(sql)
                    except OperationalError as e:
                        if e.args[0] != 1060:  # 1060: duplicate column name, i.e. the index column already exists
                            raise
                                                
                    if table_settings['RowIndexArray']:
                        #We have an array that matches a column in the 1D SQL table, so we add an index to it
                        #First create a temporary table with the index array
                        try:
                            row_index = remote_hdf5[table_settings['RowIndexArray']]
                        except KeyError:
                            raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['RowIndexArray']))
                        for prop in table_settings['Properties']:
                            if len(row_index) != remote_hdf5[prop['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'row' else 1]:
                                raise Exception("Property {0} has a different row length to the row index".format(prop['Id']))
                        self._dropTable('`TempRowIndex`')
                        sql = ImpUtils.Numpy_to_SQL().create_table('TempRowIndex', table_settings['RowIndexField'], row_index)
                        ImpUtils.ExecuteSQLGenerator(self._calculationObject, self._datasetId, sql)

                        #We have a datatable - add an index to it then copy that index across to the data table
                        sql = """ALTER TABLE `TempRowIndex` ADD `index` INT DEFAULT NULL;
                                 SELECT @i:=-1;UPDATE `TempRowIndex` SET `index` = @i:=@i+1;
                                 UPDATE `{0}` INNER JOIN `TempRowIndex` ON `{0}`.`{1}` = `TempRowIndex`.`{1}` SET `{0}`.`{2}_row_index` = `TempRowIndex`.`index`;
                                 """.format(
                            table_settings['RowDataTable'],
                            table_settings['RowIndexField'],
                            tableid)
                        self._execSql(sql)
                        self._dropTable('`TempRowIndex`')
                        #Now check we have no NULLS
                        sql = "SELECT `{1}_row_index` from `{0}` where `{1}_row_index` IS NULL".format(
                            table_settings['RowDataTable'],
                            tableid)
                        nulls = self._execSqlQuery(sql)
                        if len(nulls) > 0:
                            print("WARNING: Not all rows in {0} have a corresponding row in 2D datatable {1}".format(table_settings['RowDataTable'], tableid))
                    else:
                        
                        #No index array was given, so the HDF5 data is assumed to be in the same order as the sorted "RowIndexField"; index by that column in order
                        sql = """SELECT @i:=-1;UPDATE `{0}` SET `{2}_row_index` = @i:=@i+1 ORDER BY `{1}`;
                                 """.format(
                            table_settings['RowDataTable'],
                            table_settings['RowIndexField'],
                            tableid)
                        self._execSql(sql)
    
    
                #We have the indexes - now we need a local copy of the HDF5 data for each property
                ImpUtils.mkdir(os.path.join(config.BASEDIR, '2D_data'))
                path_join = os.path.join(config.BASEDIR, '2D_data', self._datasetId + '_' + tableid + '.hdf5')
                try:
                    os.remove(path_join)
                except OSError:
                    pass
                if table_settings['SymlinkData']:
                    print('Symlinking datasets - will only work on unix')
                    os.symlink(dataFile, path_join)
                else:
                    local_hdf5 = h5py.File(path_join, 'w', libver='latest')
                    print('Copying HDF5 datasets')
                    for prop in table_settings['Properties']:
                        print('..', prop)
                        prop_in = remote_hdf5[prop['Id']]
                        #Make some choices assuming data is variants/samples
                        if prop_in.shape[0] > prop_in.shape[1]:
                            chunks = [min(1000, prop_in.shape[0]), min(10, prop_in.shape[1])]
                        else:
                            chunks = [min(10, prop_in.shape[0]), min(1000, prop_in.shape[1])]
                        arity = 1 if len(prop_in.shape) == 2 else prop_in.shape[2]
                        if arity > 1:
                            chunks.append(arity)
                        prop_out = local_hdf5.create_dataset(prop['Id'], prop_in.shape, prop_in.dtype, chunks=tuple(chunks), maxshape=prop_in.shape, compression='gzip', fletcher32=False, shuffle=False)
                        self._hdf5_copy(prop_in, prop_out, limit=(None, max_line_count) if table_settings['FirstArrayDimension'] == 'row' else (max_line_count, None))
                        print('done')
                    print('all copies complete')
                    local_hdf5.close()
                remote_hdf5.close()
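
The chunk-shape heuristic in the copy loop above gives long chunks along the larger axis (typically variants) and short chunks along the smaller one (typically samples). A standalone sketch of the same choice, with a hypothetical file name and shape:

import h5py
import numpy as np

def choose_chunks(shape):
    # Chunks of up to 1000 along the long axis and up to 10 along the
    # short one, mirroring the heuristic above; a trailing arity
    # dimension, when present, is kept whole.
    if shape[0] > shape[1]:
        chunks = [min(1000, shape[0]), min(10, shape[1])]
    else:
        chunks = [min(10, shape[0]), min(1000, shape[1])]
    if len(shape) == 3:
        chunks.append(shape[2])
    return tuple(chunks)

# Hypothetical example: 100000 variants x 50 samples of single-byte calls.
with h5py.File('example_2d.hdf5', 'w') as f:
    data = np.zeros((100000, 50), dtype='i1')
    f.create_dataset('calls', data=data, chunks=choose_chunks(data.shape),
                     compression='gzip')
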
Example No. 9
def LoadTable0(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings):

    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))

    colDict = {col['name']: col for col in columns}
    colNameList = [col['name'] for col in columns]
    print('Column info: ' + str(columns))

    tb = VTTable.VTTable()
    tb.allColumnsText = True
    try:
        tb.LoadFile(sourceFileName, loadSettings['MaxTableSize'])
    except Exception as e:
        raise Exception('Error while reading file: '+str(e))
    calculationObject.Log('---- ORIG TABLE ----')
    with calculationObject.LogDataDump():
        tb.PrintRows(0, 9)

    for col in columns:
        if not tb.IsColumnPresent(col['name']):
            raise Exception('Missing column "{0}" in datatable "{1}"'.format(col['name'], tableid))

    if loadSettings['PrimKey'] not in colDict:
        raise Exception('Missing primary key column "{0}" in datatable "{1}"'.format(loadSettings['PrimKey'], tableid))

    for col in tb.GetColList():
        if col not in colDict:
            tb.DropCol(col)
    tb.ArrangeColumns(colNameList)
    for col in columns:
        colname = col['name']
        if ImpUtils.IsValueDataTypeIdenfifier(col['DataType']):
            tb.ConvertColToValue(colname)
        if col['DataType'] == 'Boolean':
            tb.MapCol(colname, ImpUtils.convertToBooleanInt)
            tb.ConvertColToValue(colname)
    calculationObject.Log('---- PROCESSED TABLE ----')
    with calculationObject.LogDataDump():
        tb.PrintRows(0, 9)

    createcmd = 'CREATE TABLE {0} ('.format(tableid)
    frst = True
    for col in columns:
        if not frst:
            createcmd += ', '
        colname = col['name']
        colnr = tb.GetColNr(colname)
        datatypestr = ''
        if col['DataType'] == 'Text':
            maxlength = 1
            for rownr in tb.GetRowNrRange():
                maxlength = max(maxlength, len(tb.GetValue(rownr, colnr)))
            datatypestr = 'varchar({0})'.format(maxlength)
        if len(datatypestr) == 0:
            datatypestr = ImpUtils.GetSQLDataType(col['DataType'])
        createcmd += colname + ' ' + datatypestr
        frst = False
    createcmd += ')'

    calculationObject.Log('Creating datatable')
    scr = ImpUtils.SQLScript(calculationObject)
    scr.AddCommand('drop table if exists {0}'.format(tableid))
    scr.AddCommand(createcmd)
    scr.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, loadSettings['PrimKey']))
    scr.Execute(databaseid)

    calculationObject.Log('Loading datatable values')
    sqlfile = ImpUtils.GetTempFileName()
    tb.SaveSQLDump(sqlfile, tableid)
    ImpUtils.ExecuteSQLScript(calculationObject, sqlfile, databaseid)
    os.remove(sqlfile)
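
Judging from the keys read above, the columns argument is a list of dicts with 'name' and 'DataType' entries, and loadSettings carries the primary key plus a size cap. A hedged, hypothetical call ('calc', the path, and the DataType strings are placeholders):

columns = [
    {'name': 'sample_id', 'DataType': 'Text'},
    {'name': 'coverage', 'DataType': 'Value'},
    {'name': 'passed_qc', 'DataType': 'Boolean'},
]
loadSettings = {'PrimKey': 'sample_id', 'MaxTableSize': None}
LoadTable0(calc, '/tmp/samples.tab', 'my_dataset', 'samples', columns, loadSettings)
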
Example No. 10
def LoadTable(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings, importSettings, allowSubSampling):

    def DecoId(id):
        return '`' + id + '`'

    def EncodeCell(icontent, col):
        content = icontent
        if col['IsString']:
            if len(icontent) == 0:
                content = "''"
            else:
                try:
                    content = content.encode('ascii', 'ignore')
                except UnicodeDecodeError:
                    print('Unable to encode '+content)
                    content='*failed encoding*'
                content = content.replace("\x92", "'")
                content = content.replace("\xC2", "'")
                content = content.replace("\x91", "'")
                content = content.replace("'", "\\'")
                content = content.replace('\r\n', '\\n')
                content = content.replace('\n\r', '\\n')
                content = '\'' + content + '\''

        if col['IsValue']:
            if content in ('NA', '', 'None', 'NULL', 'null', 'inf', '-'):
                content = 'NULL'

        if col['IsDate']:
            if len(content)>=5:
                try:
                    dt = dateutil.parser.parse(content)
                    tmdiff = (dt - datetime.datetime(1970, 1, 1)).days
                    tmdiff += 2440587.5 + 0.5  # note: extra 0.5 because we set the datestamp at noon
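                    # 2440587.5 is the Julian day of the Unix epoch
                    # (1970-01-01 00:00 UTC). For example, '2000-01-01' is
                    # 10957 days after the epoch, giving
                    # 10957 + 2440587.5 + 0.5 = 2451545.0, the Julian day of
                    # noon on 2000-01-01 (J2000.0).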
                    content = str(tmdiff)
                except:
                    print('ERROR: could not parse date string ' + content)
                    content = 'NULL'
            else:
                content = 'NULL'

        if col['IsBoolean']:
            vl = content
            content = 'NULL'
            if (vl.lower() == 'true') or (vl.lower() == 'yes') or (vl.lower() == 'y') or (vl == '1'):
                content = '1'
            if (vl.lower() == 'false') or (vl.lower() == 'no') or (vl.lower() == 'n') or (vl == '0'):
                content = '0'

        return content

    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))
    colDict = {col['name']: col for col in columns}
    colNameList = [col['name'] for col in columns]
    primkey = loadSettings['PrimKey']
    autoPrimKey = (primkey == 'AutoKey')
    print('Column info: ' + str(columns))
    print('Primary key: ' + primkey)

    for col in columns:
        col['IsString'] = (col['DataType'] == 'Text')
        col['IsValue'] = ImpUtils.IsValueDataTypeIdenfifier(col['DataType'])
        col['IsDate'] = ImpUtils.IsDateDataTypeIdenfifier(col['DataType'])
        col['IsBoolean'] = (col['DataType'] == 'Boolean')
        col['MaxLen'] = 0

    destFileName = ImpUtils.GetTempFileName()

    scopeLimits = {'1k': 1000, '10k': 10000, '100k': 100000, '1M': 1000000, '10M': 10000000}
    maxLineCount = scopeLimits.get(importSettings['ScopeStr'], -1)

    # open() raises an exception on failure, so the handles are never None
    with open(sourceFileName, 'r') as ifp:
        with open(destFileName, 'w') as ofp:
            fileColNames = ifp.readline().rstrip('\n\r').split('\t')
            calculationObject.Log('File columns: ' + str(fileColNames))
            fileColIndex = {fileColNames[i]: i for i in range(len(fileColNames))}
            if not(autoPrimKey) and (primkey not in fileColIndex):
                raise Exception('File is missing primary key '+primkey)
            for col in columns:
                colname = col['name']
                if (col['ReadData']) and (colname not in fileColIndex):
                    raise Exception('File is missing column '+colname)

            blockSize = 499
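            # 499 rows go into each multi-row INSERT statement below,
            # presumably to keep every statement comfortably under the
            # server's packet limit (an assumption - the source does not
            # explain the figure)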
            blockStarted = False

            lineCount = 0
            for line in ifp:
                line = line.rstrip('\r\n')
                if len(line) > 0:
                    sourceCells = line.split('\t')
                    writeCells = []
                    for col in columns:
                        content = 'NULL'
                        if col['name'] in fileColIndex:
                            content = sourceCells[fileColIndex[col['name']]]
                            content = EncodeCell(content, col)
                        writeCells.append(content)
                        if col['IsString']:
                            col['MaxLen'] = max(col['MaxLen'], len(content))

                    if not(blockStarted):
                        ofp.write('INSERT INTO {0} ({1}) VALUES '.format(DecoId(tableid), ', '.join([DecoId(col) for col in colNameList])))
                        blockStarted = True
                        blockNr = 0

                    if blockNr > 0:
                        ofp.write(',')
                    ofp.write('(')
                    ofp.write(','.join(writeCells))
                    ofp.write(')')
                    blockNr += 1
                    if blockNr >= blockSize:
                        ofp.write(';\n')
                        blockStarted = False
                    lineCount += 1
                    if lineCount % 250000 == 0:
                        calculationObject.Log('Line '+str(lineCount))
                    if (maxLineCount > 0) and (lineCount >= maxLineCount):
                        calculationObject.Log('WARNING: Terminating import at {0} lines'.format(lineCount))
                        break

    calculationObject.Log('Column sizes: '+str({col['name']: col['MaxLen'] for col in columns}))

    calculationObject.Log('Creating schema')
    scr = ImpUtils.SQLScript(calculationObject)
    scr.AddCommand('drop table if exists {0};'.format(DecoId(tableid)))
    sql = 'CREATE TABLE {0} (\n'.format(DecoId(tableid))
    colTokens = []
    if autoPrimKey:
        colTokens.append("{0} int AUTO_INCREMENT PRIMARY KEY".format(primkey))
    if allowSubSampling:
        colTokens.append("_randomval_ double")
    for col in columns:
        st = DecoId(col['name'])
        typestr = ''
        if col['DataType'] == 'Text':
            typestr = 'varchar({0})'.format(max(1, col['MaxLen']))
        if len(typestr) == 0:
            typestr = ImpUtils.GetSQLDataType(col['DataType'])
        if len(typestr) == 0:
            raise Exception('Invalid property data type ' + col['DataType'])
        st += ' '+typestr
        colTokens.append(st)
    sql += ', '.join(colTokens)
    sql += ')'
    scr.AddCommand(sql)
    calculationObject.Log('Creation statement: '+sql)
    if not(autoPrimKey):
        scr.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, primkey))
    for col in columns:
        if ('Index' in col) and (col['Index']) and (col['name'] != primkey):
            scr.AddCommand('create index {0}_{1} ON {0}({1})'.format(tableid, col['name']))
    scr.Execute(databaseid)

    calculationObject.Log('Importing data')
    ImpUtils.ExecuteSQLScript(calculationObject, destFileName, databaseid)

    if allowSubSampling:
        with calculationObject.LogHeader('Create subsampling table'):
            calculationObject.Log('Creating random data column')
            sql = "UPDATE {0} SET _randomval_=RAND()".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE INDEX {0}_randomindex ON {0}(_randomval_)".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "DROP TABLE IF EXISTS {0}_SORTRAND".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE TABLE {0}_SORTRAND LIKE {0}".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            if autoPrimKey:
                calculationObject.Log('Restructuring AutoKey')
                sql = "alter table {0}_SORTRAND drop column AutoKey".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "alter table {0}_SORTRAND add column AutoKey int FIRST".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "create index idx_autokey on {0}_SORTRAND(AutoKey)".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "alter table {0}_SORTRAND add column RandPrimKey int AUTO_INCREMENT PRIMARY KEY".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "insert into {0}_SORTRAND select *,0 from {0} order by _randomval_".format(tableid)
            sql += ' LIMIT 5000000' # NOTE: there is little point in importing more than that!
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)


    os.remove(destFileName)
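
Because {tableid}_SORTRAND is filled in RAND() order and RandPrimKey auto-increments, the first n values of RandPrimKey form a uniform random subsample. A hypothetical consumer query (the table name is a placeholder):

# Fetch a roughly uniform random subsample of 10000 rows from the shuffled copy.
sql = "SELECT * FROM mytable_SORTRAND WHERE RandPrimKey <= 10000"
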
Example No. 11
def ImportRefGenome(calculationObject, datasetId, folder, importSettings):
    with calculationObject.LogHeader('Importing reference genome data'):

        ImportRefGenomeSummaryData(calculationObject, datasetId, folder,
                                   importSettings)

        settings = SettingsLoader.SettingsLoader(
            os.path.join(folder, 'settings'))
        settings.DefineKnownTokens(['AnnotMaxViewportSize', 'RefSequenceSumm'])
        print('Settings: ' + str(settings.Get()))
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId, settings)

        # Import reference genome
        if importSettings['ScopeStr'] == 'all':
            refsequencefile = os.path.join(folder, 'refsequence.fa')
            if os.path.exists(refsequencefile):
                with calculationObject.LogHeader(
                        'Converting reference genome'):
                    destfolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, 'Sequence')
                    if not os.path.exists(destfolder):
                        os.makedirs(destfolder)
                    tempfastafile = os.path.join(destfolder, 'refsequence.fa')
                    shutil.copyfile(refsequencefile, tempfastafile)
                    ImpUtils.RunConvertor(calculationObject,
                                          'Fasta2FilterBankData', destfolder,
                                          ['refsequence.fa'])
            else:
                calculationObject.Log(
                    'WARNING: missing reference sequence file')
        else:
            calculationObject.Log(
                'WARNING: Skipping converting reference genome')

        # Import chromosomes
        with calculationObject.LogHeader('Loading chromosomes'):
            tb = VTTable.VTTable()
            tb.allColumnsText = True
            try:
                tb.LoadFile(os.path.join(folder, 'chromosomes'))
            except Exception as e:
                raise Exception('Error while reading chromosomes file: ' +
                                str(e))
            tb.RequireColumnSet(['chrom', 'length'])
            tb.RenameCol('chrom', 'id')
            tb.RenameCol('length', 'len')
            tb.ConvertColToValue('len')
            with calculationObject.LogDataDump():
                tb.PrintRows(0, 99)
            sqlfile = ImpUtils.GetTempFileName()
            tb.SaveSQLDump(sqlfile, 'chromosomes')
            ImpUtils.ExecuteSQL(calculationObject, datasetId,
                                'DELETE FROM chromosomes')
            ImpUtils.ExecuteSQLScript(calculationObject, sqlfile, datasetId)
            os.remove(sqlfile)

        if importSettings['ScopeStr'] == 'all':
            # Import annotation
            with calculationObject.LogHeader('Converting annotation'):
                tempgfffile = ImpUtils.GetTempFileName()
                temppath = os.path.dirname(tempgfffile)
                shutil.copyfile(os.path.join(folder, 'annotation.gff'),
                                tempgfffile)
                ImpUtils.RunConvertor(calculationObject, 'ParseGFF', temppath,
                                      [os.path.basename(tempgfffile)])
                print('Importing annotation')
                ImpUtils.ExecuteSQLScript(
                    calculationObject,
                    os.path.join(temppath, 'annotation_dump.sql'), datasetId)
                os.remove(tempgfffile)
                os.remove(os.path.join(temppath, 'annotation.txt'))
                os.remove(os.path.join(temppath, 'annotation_dump.sql'))
                os.remove(os.path.join(temppath, 'annotation_create.sql'))
        else:
            calculationObject.Log('WARNING: Skipping converting annotation')
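
For reference, the chromosomes file loaded above must provide 'chrom' and 'length' columns, which are renamed to id/len on import. A minimal hypothetical file, assuming the same tab-delimited layout the other loaders in this collection use:

chrom	length
chr01	640851
chr02	947102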