def _createDatabase(self, destDir, ds_file):
    """Tabix-index a VCF datasource file and return the indexed file's basename.

    :param destDir: directory where the tabix-indexed file is written
    :param ds_file: path to the input VCF datasource file
    :return: basename of the tabix-indexed file created in destDir
    """
    # Build the tabix index with the VCF preset, then strip the directory part.
    indexed_path = TabixIndexer.index(destDir=destDir, inputFilename=ds_file, preset="vcf")
    return os.path.basename(indexed_path)
def _createDatabase( self, destDir, ds_file, ds_match_mode, index_column_names, annotation_column_names, column_names ): index_columns = [] for index_column_name in index_column_names: # ensures that all index columns are in the column names list index_columns += [column_names.index(index_column_name)] if len(index_columns) != 3 and len(index_columns) != 5: raise ValueError("Wrong number of index columns. Must be a comma separated list of length 3 or 5.") column_names = set(column_names) annotation_column_names = set(annotation_column_names) # Read the column names and determine whether they exist or not in the file data = pandas.read_csv(filepath_or_buffer=ds_file, delimiter="\t", iterator=True, chunksize=1) for chunk in data: index = chunk.columns fieldNames = set(index) missingColumns = fieldNames.difference(column_names) if len(missingColumns) != 0: msg = "The input tsv, %s, is missing the following columns: %s." % ( ds_file, string.join(missingColumns, ", "), ) raise InputMismatchException(msg) missingColumns = annotation_column_names.difference(fieldNames) if len(missingColumns) != 0: msg = "The input tsv, %s, is missing the following annotation columns: %s." 
% ( ds_file, string.join(missingColumns), ) raise InputMismatchException(msg) break # Iterate through the file and determine column's data type data = pandas.read_csv( filepath_or_buffer=ds_file, delimiter="\t", iterator=True, chunksize=10000, usecols=annotation_column_names, na_values=["", ".", "-"], ) for chunk in data: index = chunk.columns # Missing values default to float data type for idx in index: if ds_match_mode != "exact" or len(index_column_names) != 3: if numpy.issubdtype(chunk[idx].dtype, numpy.inexact): if idx not in self.columnDataTypes or self.columnDataTypes[idx] not in ("String",): self.columnDataTypes[idx] = "Float" elif numpy.issubdtype(chunk[idx].dtype, numpy.integer): if idx not in self.columnDataTypes or self.columnDataTypes[idx] not in ("Float", "String"): self.columnDataTypes[idx] = "Integer" elif numpy.issubdtype(chunk[idx].dtype, numpy.bool_): self.columnDataTypes[idx] = "Flag" else: self.columnDataTypes[idx] = "String" else: self.columnDataTypes[idx] = "String" if ds_match_mode != "exact" or len(index_column_names) != 3: break tabixIndexedFile = TabixIndexer.index( destDir=destDir, inputFilename=ds_file, fileColumnNumList=index_columns, preset="tsv" ) baseDSFile = os.path.basename(tabixIndexedFile) logging.getLogger(__name__).info("%s file was created." % tabixIndexedFile) return baseDSFile